Example #1
    def post(self):
        route_loader_key = self.request.get('rll_key')
        logging.debug('work on %s' % self.request.get('rll_key'))
        route_loader = RouteListingLoader.get(route_loader_key)
        if route_loader is None:
            logging.error('total fail. unable to find %s' % route_loader_key)
        else:
            logging.debug(route_loader.routeID)
            # find the corresponding stop details
            stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", route_loader.stopID).get()
            if stop is None:
              logging.error("Missing stop %s which should be impossible" % route_loader.stopID);

            try:
                url = CRAWL_URLBASE + '?r=' + route_loader.routeCode + '&d=' + route_loader.directionCode + '&s=' + route_loader.stopCode
                logging.debug(url)
                route = RouteListing()
                route.route = route_loader.routeID
                route.routeCode = route_loader.routeCode
                route.direction = route_loader.directionCode
                route.stopID = route_loader.stopID
                route.stopCode = route_loader.stopCode
                route.scheduleURL = url
                route.stopLocation = stop
                route.put()
                logging.info("added new route listing entry to the database!")

                DestinationListing.get_or_insert(route_loader.direction, id=route_loader.directionCode, label=route_loader.direction)
            except TransactionFailedError:
                logging.error('FAIL : unable to store RouteListing for route %s, stop %s',
                              route_loader.routeID, route_loader.stopID)
                # non-2xx status so the task queue will retry this work item
                self.response.set_status(500)
                self.response.out.write('transaction fail')

        return
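The handler above expects the datastore key of a RouteListingLoader in the 'rll_key' parameter. A minimal enqueue sketch, assuming the App Engine task queue API, a hypothetical worker URL, and the 'crawler' queue name used by the later examples (on older SDKs the import lives under google.appengine.api.labs.taskqueue):

from google.appengine.api.taskqueue import Task

def enqueue_route_listing_loader(loader):
    # hypothetical URL; it must be routed to the handler above in the WSGI application
    task = Task(url='/crawl/routelist/task',
                params={'rll_key': str(loader.key())})
    task.add('crawler')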
Example #2
    def post(self):
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')
            logging.debug("task scraping for %s, direction %s, route %s" %
                          (scrapeURL, direction, routeID))

            loop = 0
            done = False
            result = None
            #start = quota.get_request_cpu_usage()
            while not done and loop < 3:
                try:
                    # fetch the page
                    result = urlfetch.fetch(scrapeURL)
                    done = True
                except urlfetch.DownloadError:
                    logging.info("Error loading page (%s)... sleeping" % loop)
                    if result:
                        logging.debug("Error status: %s" % result.status_code)
                        logging.debug("Error header: %s" % result.headers)
                        logging.debug("Error content: %s" % result.content)
                    # back off and retry even when there is no partial result,
                    # otherwise the loop counter never advances
                    time.sleep(4)
                    loop = loop + 1
            #end = quota.get_request_cpu_usage()
            #logging.info("scraping took %s cycles" % (end-start))

            if result is None:
                logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
                return

            # start to interrogate the results
            soup = BeautifulSoup(result.content)
            for slot in soup.html.body.findAll("a", "ada"):
                logging.info("pulling out data from page... %s" % slot)

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)
                    # route crawler looks for titles with an ID# string
                    if title.find("#") > 0:
                        # we finally got down to the page we're looking for

                        # pull the stopID from the page content...
                        stopID = title.split("#")[1].split("]")[0]

                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()

                        logging.info("found stop %s, %s" %
                                     (stopID, intersection))

                        # check for conflicts...
                        stop = db.GqlQuery(
                            "SELECT * FROM StopLocation WHERE stopID = :1",
                            stopID).get()
                        if stop is None:
                            logging.error(
                                "Missing stop %s which should be impossible" %
                                stopID)

                        # pull the route and direction data from the URL
                        routeData = scrapeURL.split('?')[1]
                        logging.info(
                            "FOUND THE PAGE ---> arguments: %s stopID: %s" %
                            (routeData, stopID))
                        routeArgs = routeData.split('&')
                        routeID = routeArgs[0].split('=')[1]
                        directionID = routeArgs[1].split('=')[1]
                        timeEstimatesURL = CRAWL_URLBASE + href

                        # check for conflicts...
                        r = db.GqlQuery(
                            "SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                            routeID, directionID, stopID).get()
                        if r is None:
                            # add the new route to the DB
                            route = RouteListing()
                            route.route = routeID
                            route.direction = directionID
                            route.stopID = stopID
                            route.scheduleURL = timeEstimatesURL
                            route.stopLocation = stop
                            route.put()
                            logging.info(
                                "added new route listing entry to the database!"
                            )
                        else:
                            logging.error("we found a duplicate entry!?! %s",
                                          r.scheduleURL)
                    #else: # title.split(",")[0].isdigit():
                    else:
                        if href.find("?r=") > -1:
                            # create a new task with this link
                            crawlURL = CRAWL_URLBASE + href
                            if routeID == '00':
                                routeID = href.split('r=')[1]
                            elif href.find("&") > -1:
                                routeID = href.split('&')[0].split('r=')[1]
                            task = Task(url='/crawl/routelist/crawlingtask',
                                        params={
                                            'crawl': crawlURL,
                                            'direction': title,
                                            'routeID': routeID
                                        })
                            task.add('crawler')
                            logging.info(
                                "Added new task for %s, direction %s, route %s"
                                % (title.split(",")[0], title, routeID))
                        # label crawler looks for titles with letters for extraction/persistence
                        if title.replace('-', '').replace(' ', '').isalpha():
                            logging.info(
                                "found the route LABEL page! href: %s" % href)
                            routeData = href.split('?')[1]
                            routeArgs = routeData.split('&')
                            directionID = routeArgs[1].split('=')[1]

                            l = DestinationListing.get_or_insert(
                                title, id=directionID, label=title)

        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return

        return
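This crawler keys off the stop-link title format shown in the comments of Example #3, e.g. "CAMPUS & BABCOCK RR [EB#0809]". A standalone sketch of the same split logic used above (function name is illustrative):

def parse_stop_title(title):
    # "CAMPUS & BABCOCK RR [EB#0809]" -> ("0809", "CAMPUS & BABCOCK RR")
    stopID = title.split("#")[1].split("]")[0]
    intersection = title.split("[")[0].strip()
    return stopID, intersection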
Example #3
    def post(self):
        try:
            scrapeURL = self.request.get('crawl')
            direction = self.request.get('direction')
            routeID = self.request.get('routeID')

            loop = 0
            done = False
            result = None
            while not done and loop < 3:
                try:
                    # fetch the page
                    result = urlfetch.fetch(scrapeURL)
                    done = True
                except urlfetch.DownloadError:
                    logging.info("Error loading page (%s)... sleeping" % loop)
                    if result:
                        logging.debug("Error status: %s" % result.status_code)
                        logging.debug("Error header: %s" % result.headers)
                        logging.debug("Error content: %s" % result.content)
                    # back off and retry even when there is no partial result,
                    # otherwise the loop counter never advances
                    time.sleep(4)
                    loop = loop+1

            if result is None:
                logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
                return

            # start to interrogate the results
            soup = BeautifulSoup(result.content)
            for slot in soup.html.body.findAll("a","adalink"):

                if slot.has_key('href'):
                    href = slot['href']
                    title = slot['title']
                    logging.info("FOUND A TITLE ----> %s" % title)

                    # route crawler looks for titles with an ID# string
                    #
                    # stop links have the following format
                    #    <a class="adalink" title="CAMPUS &amp; BABCOCK RR [EB#0809]" href="?r=61&amp;d=102&amp;s=3457">CAMPUS &amp; BABCOCK RR [EB#0809]</a>
                    #
                    if title.find("#") > 0:
                        # we finally got down to the page we're looking for

                        # pull the stopID from the page content...
                        stopID = title.split("#")[1].split("]")[0]

                        # pull the intersection from the page content...
                        intersection = title.split("[")[0].strip()

                        logging.info("found stop %s, %s" % (stopID,intersection))

                        # check for conflicts...
                        stop = db.GqlQuery("SELECT * FROM StopLocation WHERE stopID = :1", stopID).get()
                        if stop is None:
                          logging.error("Missing stop %s which should be impossible" % stopID);

                        # pull the route and direction data from the URL
                        routeData = scrapeURL.split('?')[1]
                        logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" % (routeData,stopID))
                        routeArgs = routeData.split('&')
                        fakeRouteID = routeArgs[0].split('=')[1]
                        directionID = routeArgs[1].split('=')[1]
                        timeEstimatesURL = CRAWL_URLBASE + href

                        # check for conflicts...
                        r = db.GqlQuery("SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                                        routeID, directionID, stopID).get()
                        if r is None:
                            # add the new route to the DB
                            route = RouteListing()
                            route.route = routeID
                            route.direction = directionID
                            route.stopID = stopID
                            route.scheduleURL = timeEstimatesURL
                            route.stopLocation = stop
                            route.put()
                            logging.info("added new route listing entry to the database!")
                        else:
                            logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                    else:
                        # direction links look like the following,
                        #    <a class="adalink" title="CapSq" href="?r=61&amp;d=102">CapSq</a>
                        #
                        # fetch the next page depth to get to the stop details
                        #

                        if href.find("?r=") > -1:
                            # create a new task with this link
                            crawlURL = CRAWL_URLBASE + href
                            task = Task(url='/crawl/routelist/crawlingtask', params={'crawl':crawlURL,'direction':title,'routeID':routeID})
                            task.add('crawler')
                            logging.info("Added new task for %s, direction %s, route %s" % (title.split(",")[0],title,routeID))

                        # label crawler looks for titles with letters for extraction/persistence
                        if title.replace('-','').replace(' ','').isalpha():
                            # the direction code follows 'd=' in the href (e.g. "?r=61&d=102")
                            directionID = href.split('d=')[1]
                            logging.info("found the route LABEL page! href: %s" % href)
                            l = DestinationListing.get_or_insert(title, id=directionID, label=title)

        except apiproxy_errors.DeadlineExceededError:
            logging.error("DeadlineExceededError exception!?")
            return

        return
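All three handlers are POST targets for task queue requests. A minimal wiring sketch for the crawling task, assuming the classic webapp framework and a hypothetical handler class name; the URL '/crawl/routelist/crawlingtask' comes from the Task() calls above:

from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app

application = webapp.WSGIApplication(
    [('/crawl/routelist/crawlingtask', CrawlingTaskHandler)],  # hypothetical class name
    debug=True)

def main():
    run_wsgi_app(application)

if __name__ == '__main__':
    main()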