Example #1
    def post(self):
        # read the stop attributes out of the request
        stopID = self.request.get('stopID')
        lat = self.request.get('lat')
        lon = self.request.get('lon')

        # create a new StopLocation with placeholder route and direction values
        stop = StopLocation()
        stop.stopID = stopID
        stop.routeID = '00'
        stop.intersection = self.request.get('intersection').upper()
        stop.location = GeoPt(lat, lon)
        stop.update_location()
        stop.direction = '00'
        logging.debug('created new stoplocation for %s' % stopID)
        stop.put()

        # point every RouteListing that references this stop at the new location
        routeQ = db.GqlQuery("SELECT * FROM RouteListing WHERE stopID = :1",
                             stopID)
        routes = routeQ.fetch(100)
        for r in routes:
            logging.debug('updating route %s with new location' % r.route)
            r.stopLocation = stop
            r.put()

        self.redirect('http://smsmybus.com/labs/displaystops')
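
The snippets on this page are post() methods cut out of larger App Engine request handlers, so the imports and datastore models they depend on are not shown. The sketch below is a guess at that surrounding context, not the project's actual code: the property lists are inferred from the handlers, and the update_location() call suggests StopLocation extends GeoModel from the geomodel library.

import logging
import time

from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.db import GeoPt
from google.appengine.api import urlfetch
from google.appengine.api.labs.taskqueue import Task  # taskqueue moved out of labs in later SDKs
from geo.geomodel import GeoModel  # assumption: supplies the location property and update_location()

class StopLocation(GeoModel):
    # hypothetical property list inferred from the handlers in these examples
    stopID = db.StringProperty()
    routeID = db.StringProperty()
    intersection = db.StringProperty()
    direction = db.StringProperty()

class RouteListing(db.Model):
    # hypothetical property list inferred from the handlers in these examples
    route = db.StringProperty()
    direction = db.StringProperty()
    stopID = db.StringProperty()
    scheduleURL = db.StringProperty()
    stopLocation = db.ReferenceProperty(StopLocation)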
Example #2
    def post(self):
        intersection = self.request.get('intersection')
        latitude = self.request.get('latitude')
        longitude = self.request.get('longitude')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        stopID = self.request.get('stopID')
        logging.info("storing route %s intersection %s at lat/lon %s,%s toward %s" % 
                     (routeID,intersection,latitude,longitude,direction))
        
        # cap the intersection text so it fits in the datastore property
        if len(intersection) > 400:
            intersection = intersection[:400]

        # self.request.get() returns an empty string when a parameter is missing
        if stopID == '00' or not latitude or not longitude:
            # create a task event to process the error
            task = Task(url='/crawl/errortask', params={'intersection':intersection,
                                                        'location':(latitude+","+longitude),
                                                        'direction':direction,
                                                        'metaStringOne':self.request.get('crawlLine'),
                                                        'metaStringTwo':'from geotask crawler',
                                                        'routeID':routeID,
                                                        'stopID':stopID,
                                                        })
            task.add('crawlerrors')
        else:
            # ignore this stop if we've already stored this stopID + routeID combination
            stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1 and routeID = :2", stopID, routeID).get()
            if stop is None:
                stop = StopLocation()
                stop.stopID = stopID
                stop.routeID = routeID
                stop.intersection = intersection.upper()
                stop.direction = direction.upper()
                stop.location = GeoPt(latitude,longitude)
                stop.update_location()
                stop.put()
            
                # update the route table to include a reference to the new geo data
                if stopID != '00':
                    route = db.GqlQuery("SELECT * FROM RouteListing WHERE stopID = :1 and route = :2", stopID,routeID).get()
                    if route is None:
                        logging.error("IMPOSSIBLE... no stop on record?!? stop %s, route %s" % (stopID,routeID))
                        # create a task event to process the error
                        task = Task(url='/crawl/errortask', params={'intersection':intersection,
                                                            'location':(latitude+","+longitude),
                                                            'direction':direction,
                                                            'metaStringOne':self.request.get('crawlLine'),
                                                            'metaStringTwo':'routelisting update',
                                                            'routeID':routeID,
                                                            'stopID':stopID,
                                                            })
                        task.add('crawlerrors')
                    else:
                        route.stopLocation = stop
                        route.put()

        return
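
The error branch above only enqueues work on the 'crawlerrors' queue; the handler mapped to /crawl/errortask is not part of this snippet. A hypothetical sketch of such a handler, reading back the same parameters (the class name and the logging are assumptions, not the project's code):

class CrawlErrorTask(webapp.RequestHandler):
    # hypothetical /crawl/errortask handler; not taken from the original project
    def post(self):
        logging.error("crawl error (%s) for stop %s on route %s at %s heading %s",
                      self.request.get('metaStringTwo'),
                      self.request.get('stopID'),
                      self.request.get('routeID'),
                      self.request.get('location'),
                      self.request.get('direction'))
        return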
Example #3
    def post(self):
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')
            logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL,direction,routeID))
            
            loop = 0
            done = False
            result = None
            start = quota.get_request_cpu_usage()
            while not done and loop < 3:
                try:
                    # fetch the page
                    result = urlfetch.fetch(scrapeURL)
                    done = True
                except urlfetch.DownloadError:
                    logging.info("Error loading page (%s)... sleeping" % loop)
                    if result:
                        logging.debug("Error status: %s" % result.status_code)
                        logging.debug("Error header: %s" % result.headers)
                        logging.debug("Error content: %s" % result.content)
                    # sleep and bump the retry counter even when there is no
                    # partial result, otherwise the loop never advances
                    time.sleep(4)
                    loop = loop + 1
            end = quota.get_request_cpu_usage()
            #logging.info("scraping took %s cycles" % (end-start))

            # start to interrogate the results
            if result is None:
                logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
                return
            soup = BeautifulSoup(result.content)
            stopUpdates = []
            for slot in soup.html.body.findAll("a","ada"):
                logging.info("pulling out data from page... %s" % slot)

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)
                    # route crawler looks for titles with an ID# string
                    if title.find("#") > 0:
                        # we finally got down to the page we're looking for
                        
                        # pull the stopID from the page content...
                        stopID = title.split("#")[1].split("]")[0]
                        
                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()
                        
                        logging.info("found stop %s, %s" % (stopID,intersection))
                        
                        # check for conflicts...
                        stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                        if stop is None:
                            # add the new stop
                            stop = StopLocation()
                            stop.stopID = stopID
                            stop.routeID = routeID
                            stop.intersection = intersection.upper()
                            stop.direction = direction.upper()
                            stopUpdates.append(stop)  # stop.put()
                            logging.info("ADDED StopLocation (%s) - MINUS geo location" % stopID)
                        else:
                            logging.info("StopLoation entity already exists for %s..." % stopID)
                            stop.routeID = routeID
                            stopUpdates.append(stop)
                        
                        # pull the route and direction data from the URL
                        routeData = scrapeURL.split('?')[1]
                        logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData,stopID))
                        routeArgs = routeData.split('&')
                        routeID = routeArgs[0].split('=')[1]
                        directionID = routeArgs[1].split('=')[1]
                        timeEstimatesURL = CRAWL_URLBASE + href
                    
                        # check for conflicts...
                        r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                        routeID, directionID, stopID).get()
                        if r is None:
                            # add the new route to the DB
                            route = RouteListing()
                            route.route = routeID
                            route.direction = directionID
                            route.stopID = stopID
                            route.scheduleURL = timeEstimatesURL
                            route.put()
                            logging.info("added new route listing entry to the database!")
                        else:
                            logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                    #else: # title.split(",")[0].isdigit():
                    elif href.find("?r=") > -1:
                        # create a new task with this link
                        crawlURL = CRAWL_URLBASE + href
                        if routeID == '00':
                            routeID = href.split('r=')[1]
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/routelist/crawlingtask', params={'crawl':crawlURL,'direction':title,'routeID':routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0],title,routeID))                    
                    # label crawler looks for titles with letters for extraction/persistence
                    #elif title.replace('-','').replace(' ','').isalpha():
                    #    routeData = href.split('?')[1]
                    #    logging.info("found the route LABEL page! href: %s" % href)
                    #    routeArgs = routeData.split('&')
                    #    directionID = routeArgs[1].split('=')[1]
                    #    
                    #    l = DestinationListing.get_or_insert(title, id=directionID, label=title)

            # push the StopLocation updates to the datastore
            db.put(stopUpdates)
                                        
        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return
            
        return
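
The stopID and intersection parsing above assumes the link title looks roughly like 'SOME INTERSECTION [ID#1234]' (the '[ID#' check in the next example confirms that shape). A quick worked example with a made-up title string:

title = "JOHNSON & BASSETT [ID#1071]"          # hypothetical title text
stopID = title.split("#")[1].split("]")[0]     # -> "1071"
intersection = title.split("[")[0].strip()     # -> "JOHNSON & BASSETT"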
Example #4
    def post(self):
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')
            logging.debug("task scraping for %s, direction %s, route %s" % (scrapeURL,direction,routeID))
            
            # fetch the URL content
            content = fetchURL(scrapeURL)
            
            # start to interrogate the results
            soup = BeautifulSoup(content)
            stopUpdates = []
            for slot in soup.html.body.findAll("a","ada"):
                #logging.info("pulling out data from page... %s" % slot)

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)
                    # route crawler looks for titles with an ID# string
                    if title.find("[ID#") > 0:
                        # we finally got down to the page we're looking for. this is a reference
                        # to a specific stop including a stopID and intersection.
                        
                        # pull the stopID from the page content...
                        stopID = title.split("ID#")[1].split("]")[0]
                        
                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()
                        
                        logging.info("found stop %s, %s" % (stopID,intersection))
                        
                        # check to see if we've already found this stop...
                        q = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1 and direction = :2 and routeID = :3", 
                                        stopID, direction.upper(), routeID)
                        stopQuery = q.fetch(1)
                        if len(stopQuery) == 0:
                            # add the new stop
                            stop = StopLocation()
                            stop.stopID = stopID
                            stop.routeID = routeID
                            stop.intersection = intersection.upper()
                            stop.direction = direction.upper()
                            stopUpdates.append(stop)  # we'll do a batch put at the end 
                            logging.info("added new stop listing MINUS geo location")
                        else:
                            logging.info("already have this stop in the table...")
                            stopQuery[0].routeID = routeID
                            stopUpdates.append(stopQuery[0])
                        
                    elif href.find("?r=") > -1:
                        # this is step #2 and #3 from the algorithm documented above. we're going to create 
                        # a new task to go off and scrape the live route data for a specific route.
                        crawlURL = CRAWL_URLBASE + href
                        if routeID == '00':
                            routeID = href.split('r=')[1]
                        elif href.find("&") > -1:
                            routeID = href.split('&')[0].split('r=')[1]
                        task = Task(url='/crawl/crawlingtask', params={'crawl':crawlURL,'direction':title,'routeID':routeID})
                        task.add('crawler')
                        logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0],title,routeID))

            # push the StopLocation updates to the datastore
            db.put(stopUpdates)
                                        
        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return
            
        return
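
Unlike the previous example, this handler delegates the page fetch to a fetchURL() helper that is not shown here. A minimal sketch of what it might look like, assuming it wraps the same urlfetch retry loop (the name comes from the snippet, but the signature, defaults, and body are guesses):

def fetchURL(url, attempts=3, delay=4):
    # hypothetical helper wrapping the urlfetch retry loop from the previous
    # example; the project's real implementation may differ
    for attempt in range(attempts):
        try:
            return urlfetch.fetch(url).content
        except urlfetch.DownloadError:
            logging.info("Error loading page (%s)... sleeping" % attempt)
            time.sleep(delay)
    return None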