def post(self):
    """Task handler: materialize a RouteListing from a stored RouteListingLoader.

    Reads the RouteListingLoader key from the 'rll_key' request parameter,
    looks up the matching StopLocation, builds the schedule-crawl URL, and
    persists a new RouteListing plus a DestinationListing to the datastore.
    """
    route_loader_key = self.request.get('rll_key')
    logging.debug('work on %s', route_loader_key)

    route_loader = RouteListingLoader.get(route_loader_key)
    if route_loader is None:
        logging.error('total fail. unable to find %s', route_loader_key)
    else:
        logging.debug(route_loader.routeID)

        # find the corresponding stop details
        stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1",
                           route_loader.stopID).get()
        if stop is None:
            # NOTE(review): original code logs but still proceeds, leaving
            # route.stopLocation = None below — behavior preserved.
            logging.error("Missing stop %s which should be impossible",
                          route_loader.stopID)

        try:
            url = (CRAWL_URLBASE + '?r=' + route_loader.routeCode +
                   '&d=' + route_loader.directionCode +
                   '&s=' + route_loader.stopCode)
            logging.debug(url)

            route = RouteListing()
            route.route = route_loader.routeID
            route.routeCode = route_loader.routeCode
            route.direction = route_loader.directionCode
            route.stopID = route_loader.stopID
            route.stopCode = route_loader.stopCode
            route.scheduleURL = url
            route.stopLocation = stop
            route.put()
            logging.info("added new route listing entry to the database!")

            DestinationListing.get_or_insert(route_loader.direction,
                                             id=route_loader.directionCode,
                                             label=route_loader.direction)
        except TransactionFailedError:
            # BUG FIX: the two IDs were passed as a single tuple, which does
            # not satisfy the two %s placeholders and made this log call
            # itself fail to format; pass them as separate lazy arguments.
            logging.error('FAIL : unable to store RouteListing for route %s, stop %s',
                          route_loader.routeID, route_loader.stopID)
            # BUG FIX: 2 is not a valid HTTP status code (statuses are >= 100);
            # use 500 so the task queue sees a failure and retries this task.
            self.response.set_status(500)
            self.response.out.write('transaction fail')
    return
def post(self):
    """Task handler: crawl one page of the route/stop listing site.

    Request params: 'crawl' is the URL to fetch, 'direction' a direction
    label, 'routeID' the route being crawled ('00' means not yet known).

    Stop-detail links (titles containing '#') are persisted as RouteListing
    entities; other route/direction links spawn follow-up crawl tasks, and
    purely alphabetic titles are stored as DestinationListing labels.
    """
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL, direction, routeID))

        # fetch the page, retrying up to three times on DownloadError
        loop = 0
        done = False
        result = None
        #start = quota.get_request_cpu_usage()
        while not done and loop < 3:
            try:
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1
        #end = quota.get_request_cpu_usage()
        #logging.info("scraping took %s cycles" % (end-start))

        # BUG FIX: if all three fetch attempts failed, result is still None
        # and result.content below raised AttributeError. Fail the request
        # explicitly so the task queue retries it.
        if result is None:
            logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
            self.response.set_status(500)
            return

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)
                # route crawler looks for titles with an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for
                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]
                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))

                    # check for conflicts...
                    stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1",
                                       stopID).get()
                    if stop is None:
                        logging.error("Missing stop %s which should be impossible" % stopID)

                    # pull the route and direction data from the URL
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href

                    # check for conflicts...
                    r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                    routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                #else: # title.split(",")[0].isdigit():
                else:
                    if href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        if routeID == '00':
                            # BUG FIX: splitting on 'r=' alone left any trailing
                            # '&d=...' arguments in routeID (e.g. '61&d=102');
                            # strip at the first '&' as the other branch does.
                            routeID = href.split('r=')[1].split('&')[0]
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/crawl/routelist/crawlingtask',
                                    params={'crawl': crawlURL,
                                            'direction': title,
                                            'routeID': routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" %
                                     (title.split(",")[0], title, routeID))

                    # label crawler looks for titles with letters for extraction/persistence
                    if title.replace('-', '').replace(' ', '').isalpha():
                        logging.info("found the route LABEL page! href: %s" % href)
                        routeData = href.split('?')[1]
                        routeArgs = routeData.split('&')
                        directionID = routeArgs[1].split('=')[1]
                        l = DestinationListing.get_or_insert(title, id=directionID, label=title)

    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return
    return
def post(self):
    """Task handler: crawl one page of the route listing site ('adalink' markup).

    Request params: 'crawl' is the URL to fetch, 'direction' a direction
    label, 'routeID' the route being crawled. Stop links (titles containing
    '#') become RouteListing entities; direction links spawn follow-up crawl
    tasks; alphabetic titles are stored as DestinationListing labels.
    """
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')

        # fetch the page, retrying up to three times on DownloadError
        loop = 0
        done = False
        result = None
        while not done and loop < 3:
            try:
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop = loop + 1

        # BUG FIX: if all three fetch attempts failed, result is still None
        # and result.content below raised AttributeError. Fail the request
        # explicitly so the task queue retries it.
        if result is None:
            logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
            self.response.set_status(500)
            return

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "adalink"):
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)
                # route crawler looks for titles with an ID# string
                #
                # stop links have the following format
                # <a class="adalink" title="CAMPUS & BABCOCK RR [EB#0809]" href="?r=61&d=102&s=3457">CAMPUS & BABCOCK RR [EB#0809]</a>
                #
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for
                    # pull the stopID from the page content...
                    stopID = title.split("#")[1].split("]")[0]
                    # pull the intersection from the page content...
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))

                    # check for conflicts...
                    stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1",
                                       stopID).get()
                    if stop is None:
                        logging.error("Missing stop %s which should be impossible" % stopID)

                    # pull the route and direction data from the URL.
                    # the route id embedded in the URL is not trusted here —
                    # the request's routeID is used for the entity below.
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData, stopID))
                    routeArgs = routeData.split('&')
                    fakeRouteID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href

                    # check for conflicts...
                    r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                    routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                else:
                    # direction links look like the following,
                    # <a class="adalink" title="CapSq" href="?r=61&d=102">CapSq</a>
                    #
                    # fetch the next page depth to get to the stop details
                    #
                    if href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        task = Task(url='/crawl/routelist/crawlingtask',
                                    params={'crawl': crawlURL,
                                            'direction': title,
                                            'routeID': routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" %
                                     (title.split(",")[0], title, routeID))

                    # label crawler looks for titles with letters for extraction/persistence
                    if title.replace('-', '').replace(' ', '').isalpha():
                        # BUG FIX: split('d=')[0] is the text *before* 'd='
                        # (e.g. '?r=61&'), not the direction id; take [1] to
                        # get the value after 'd=', consistent with the other
                        # crawler handler in this file.
                        directionID = href.split('d=')[1]
                        logging.info("found the route LABEL page! href: %s" % href)
                        l = DestinationListing.get_or_insert(title, id=directionID, label=title)

    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return
    return