Example #1
0
    def post(self):
        """Extract one type of event for a period and queue the next task.

        The period is read from the 'period' request argument or, when that
        is empty, from a stored StatsRun entity. The remaining parameters
        ('table_name', 'downloads_extracted', 'searches_extracted') come from
        the request in the first case and from the Period entity in the
        second.

        Downloads are extracted first, then searches. When both flags are
        already set, the 'process_events' task is queued and nothing else is
        done. Otherwise the events are fetched, parsed and counted, one
        ReportToProcess entity is stored per resource, the Period flag for
        the extracted type is set, and the next task is queued.

        Error conditions are logged; some of them also write a JSON error
        body with a 400 or 500 status. The method always returns None.
        """
        # Tracks whether parameters came from a direct request (True) or
        # from stored entities (False).
        self.params_from_request = None

        s = "Version: %s\n" % __version__
        s += "Arguments from POST:"
        for arg in self.request.arguments():
            s += '\n%s:%s' % (arg, self.request.get(arg))
        logging.info(s)

        # Try to get period from the request in case GetEvents was called
        # directly
        self.period = self.request.get("period", None)

        # If real period not in request, try to get it from the StatsRun
        # entity in case GetEvents was called from a previous task.
        if not self.period:
            run_key = ndb.Key("StatsRun", 5759180434571264)
            run_entity = run_key.get()
            # A missing StatsRun entity falls through to the 400 response
            # below instead of failing on attribute access.
            self.period = run_entity.period if run_entity else None
            self.params_from_request = False
            s = "Version: %s\n" % __version__
            s += "Period %s determined from StatsRun entity: %s" % (
                self.period, run_key)
            logging.info(s)
        else:
            self.params_from_request = True
            s = "Version: %s\n" % __version__
            s += "Period %s determined from request: %s" % (
                self.period, self.request)
            logging.info(s)

        if not self.period:
            self.error(400)
            resp = {
                "status": "error",
                "message": "Period parameter was not provided."
            }
            s = "Version: %s\n" % __version__
            s += "%s" % resp
            logging.error(s)
            self.response.write(json.dumps(resp) + "\n")
            return

        # If Period not already stored, halt
        period_key = ndb.Key("Period", self.period)
        period_entity = period_key.get()
        if not period_entity:
            self.error(400)
            resp = {
                "status": "error",
                "message": "Provided period does not exist in datastore",
                "data": {
                    "period": self.period
                }
            }
            logging.error(resp)
            self.response.write(json.dumps(resp) + "\n")
            return

        # Get the remaining parameters based on the parameter source
        if self.params_from_request:
            # Get parameters from request

            # 'table_name' parameter, falling back to the default table
            try:
                self.table_name = self.request.get('table_name') or CDB_TABLE
            except KeyError:
                self.table_name = CDB_TABLE

            # 'downloads_extracted' parameter
            try:
                self.downloads_extracted = self.request.get(
                    'downloads_extracted').lower() == 'true'
            except Exception:
                s = "Version: %s\n" % __version__
                s += "Aborting. "
                s += "Unable to extract 'downloads_extracted' " \
                     "from request: %s" % self.request
                logging.error(s)
                return

            # 'searches_extracted' parameter
            try:
                self.searches_extracted = self.request.get(
                    'searches_extracted').lower() == 'true'
            except KeyError:
                s = "Version: %s\n" % __version__
                s += "Aborting. "
                s += "Unable to extract 'searches_extracted' " \
                     "from request: %s" % self.request
                logging.error(s)
                return
        else:
            # Get parameters from Period entity

            # 'table_name' parameter, falling back to the default table
            try:
                self.table_name = period_entity.table_name or CDB_TABLE
            except KeyError:
                self.table_name = CDB_TABLE

            # 'downloads_extracted' parameter
            try:
                self.downloads_extracted = period_entity.downloads_extracted
            except Exception:
                s = "Version: %s\n" % __version__
                s += "Aborting. "
                s += "Unable to extract 'downloads_extracted' from Period"
                logging.error(s)
                return

            # 'searches_extracted' parameter
            try:
                self.searches_extracted = period_entity.searches_extracted
            except Exception:
                s = "Version: %s\n" % __version__
                s += "Aborting. "
                s += "Unable to extract 'searches_extracted' from Period"
                logging.error(s)
                return

        s = "Version: %s\n" % __version__
        s += "Using %s as data table" % self.table_name
        logging.info(s)

        # The flags are compared with '== False' on purpose: an unset (None)
        # flag is not treated as "still to be extracted".
        # Start with downloads
        if self.downloads_extracted == False:
            self.t = "download"
        # and continue with searches
        elif self.searches_extracted == False:
            self.t = "search"
        # if neither flag is False, there is nothing left to extract...
        else:
            # ... call 'process_events' and move on
            taskqueue.add(url=URI_PROCESS_EVENTS, queue_name=QUEUENAME)
            return

        # Get events
        s = "Version: %s\n" % __version__
        s += "Getting events"
        logging.info(s)
        err = self.get_events()
        if err:
            s = "Version: %s\n" % __version__
            s += "Error from get_events(): %s" % err
            logging.error(s)
            return

        # Parse events
        s = "Version: %s\n" % __version__
        s += "Parsing events"
        logging.info(s)
        err = self.parse_events()
        if err:
            s = "Version: %s\n" % __version__
            s += "Error from parse_events(): %s" % err
            logging.error(s)
            return

        # Update Period counts
        s = "Version: %s\n" % __version__
        s += "Updating Period counts"
        logging.info(s)
        err = self.update_period_counts()
        if err:
            s = "Version: %s\n" % __version__
            s += "Error from update_period_counts(): %s" % err
            logging.error(s)
            return

        # One temporary ReportToProcess entity per resource
        r = [
            ReportToProcess(
                t=self.t,
                gbifdatasetid=resource,
                resource=self.resources[resource]
            )
            for resource in self.resources
        ]

        # Store temporary entities
        s = "Version: %s\n" % __version__
        s += "Putting %d entities" % len(r)
        logging.info(s)
        sr = ndb.put_multi(r)

        # Check
        if len(sr) != len(r):
            s = "Version: %s\n" % __version__
            s += "Not all resources were put to process."
            logging.error(s)
            self.error(500)
            resp = {
                "status": "error",
                "message": s,
                "data": {
                    "period": self.period,
                    "t": self.t,
                    "resources": len(r),
                    "to_process": len(sr)
                }
            }
            self.response.write(json.dumps(resp) + "\n")
            return

        # Build response
        resp = {
            "status": "success",
            "message": "All %s events downloaded and parsed" % self.t,
            "data": {
                "period": self.period,
                "event_type": self.t,
                "event_number": len(self.data),
                "resources_to_process": len(self.resources)
            }
        }
        self.response.write(json.dumps(resp) + "\n")

        # Update Period entity with stat information
        if self.t == "search":
            period_entity.searches_extracted = True
        else:
            period_entity.downloads_extracted = True

        k = period_entity.put()
        if k != period_key:
            s = "Version: %s\n" % __version__
            s += "Could not update processing properties in period %s" % \
                self.period
            logging.error(s)
            self.error(500)
            resp = {
                "status": "error",
                "message": s,
                "data": {
                    "period": self.period,
                }
            }
            self.response.write(json.dumps(resp) + "\n")
            # Return None like every other exit path of this handler.
            return

        # If both downloads and searches have been extracted, end now
        period_entity = period_key.get()
        if period_entity.searches_extracted is True and \
           period_entity.downloads_extracted is True:
            # Call 'process_events'
            s = "Version: %s\n" % __version__
            s += "All searches and downloads extracted"
            logging.info(s)
            taskqueue.add(url=URI_PROCESS_EVENTS, queue_name=QUEUENAME)
        else:
            # One type is still pending: run this handler again
            taskqueue.add(url=URI_GET_EVENTS, queue_name=QUEUENAME)
        return
Example #2
0
    def post(self):

        # Retrieve parameters from memcache and request
        memcache_keys = ["period", "github_store", "github_issue"]
        params = memcache.get_multi(memcache_keys,
                                    key_prefix="usagestats_parser_")
        self.period = params['period']
        self.github_store = params['github_store']
        self.github_issue = params['github_issue']

        # Start the loop, until deadline
        try:

            # Prepare query for all Reports to process
            query = ReportToProcess.query()
            query = query.order(ReportToProcess.gbifdatasetid)
            logging.info("ReportToProcess queried")

            # Get cursor from request, if any
            cursor_str = self.request.get('cursor', None)
            cursor = None
            if cursor_str:
                cursor = Cursor(urlsafe=cursor_str)
            logging.info("Cursor built: %s" % cursor)

            # Initialize loop
            more = True

            # Repeat while there are reports to process
            while more is True:

                # Get the next (or first) round of elements
                logging.info("Fetching %d entities" % PAGE_SIZE)
                results, new_cursor, more = query.fetch_page(
                    PAGE_SIZE, start_cursor=cursor
                )
                logging.info("Got %d results" % len(results))

                # Process and store transactionally
                self.process_and_store(results)

                # Restart with new cursor (if any)
                if more is True:
                    cursor = new_cursor
                    logging.info("New cursor: %s" % cursor.urlsafe())

            logging.info("Finished processing reports")

            # Store memcache'd counts
            counts = memcache.get_multi([
                "processed_searches",
                "processed_downloads"
                ], key_prefix="usagestats_parser_")
            period_entity = ndb.Key("Period", self.period).get()
            period_entity.processed_searches = counts['processed_searches']
            period_entity.processed_downloads = counts['processed_downloads']

            resp = {
                "status": "success",
                "message": "Successfully finished processing all reports",
                "data": {
                    "processed_searches": counts['processed_searches'],
                    "processed_downloads": counts['processed_downloads']
                }
            }

            # Launch process to store reports on GitHub, if applicable
            if self.github_store is True:
                resp['message'] += ". Launching GitHub storing process"
                taskqueue.add(url=URI_GITHUB_STORE,
                              queue_name=QUEUENAME)

            # Launch process to create issues on GitHub, if applicable
            elif self.github_issue is True:
                resp['message'] += ". Launching GitHub issue process"
                taskqueue.add(url=URI_GITHUB_ISSUE,
                              queue_name=QUEUENAME)

            # Otherwise, consider finished
            else:
                resp['message'] += ". No GitHub process launched"
                period_entity.status = "done"
                mail.send_mail(
                    sender=EMAIL_SENDER,
                    to=EMAIL_RECIPIENT,
                    subject="Usage reports for period %s" % self.period,
                    body="""
Hey there!

Just a brief note to let you know the extraction of %s stats has successfully
finished, with no GitHub processes launched.

Congrats!
""" % self.period)

            # In any case, store the counts, show message and finish
            period_entity.put()
            logging.info(resp)
            self.response.write(json.dumps(resp)+"\n")

            return

        # When timeout arrives...
        except DeadlineExceededError:
            # Launch new instance with current (failed) cursor
            taskqueue.add(url=URI_PROCESS_EVENTS,
                          params={"cursor": cursor.urlsafe()},
                          queue_name=QUEUENAME)
            logging.info("Caught a DeadlineExceededError. Relaunching")

            resp = {
                "status": "in progress",
                "message": "Caught a DeadlineExceededError."
                           " Relaunching with new cursor",
                "data": {
                    "period": self.period,
                    "cursor": cursor.urlsafe()
                }
            }
            logging.info(resp)
            self.response.write(json.dumps(resp)+"\n")

        return
Example #3
0
    def initialize_extraction(self, period=None, force=None):
        """Validate the period and create a fresh Period entity for it.

        The values actually used are ``self.period`` and ``self.force``; the
        ``period`` and ``force`` arguments are accepted but not read.

        Steps:
          1. Reject a missing or malformed period (must be YYYYMM) with a
             400 JSON error.
          2. If a Period entity already exists, abort unless ``self.force``
             is True; when forced, delete the Report entities referencing
             the period and the Period itself.
          3. Create the new Period with status 'in progress'.
          4. Delete every ReportToProcess entity.

        Returns:
            0 on success, 1 when the extraction could not be initialized (a
            JSON error body has been written to the response in that case).
        """
        self.response.headers['Content-Type'] = "application/json"

        # Check that 'period' is provided
        if not self.period:
            s = "Version: %s\n" % __version__
            s += "Period not found on POST body. Aborting."
            logging.error(s)
            self.error(400)
            resp = {"status": "error", "message": s}
            self.response.write(json.dumps(resp) + "\n")
            return 1

        # Check that 'period' is valid. Year and month are parsed here, before
        # anything is deleted, so a non-numeric period is rejected cleanly.
        try:
            if len(self.period) != 6 or not self.period.isdigit():
                raise ValueError("period is not six digits")
            y, m = (int(self.period[:4]), int(self.period[-2:]))
            if not 1 <= m <= 12:
                raise ValueError("month out of range")
        except ValueError:
            s = "Version: %s\n" % __version__
            s += "Malformed period. Should be YYYYMM (e.g., 201603)"
            logging.error(s)
            self.error(400)
            resp = {"status": "error", "message": s}
            self.response.write(json.dumps(resp) + "\n")
            return 1

        # Get existing period
        period_key = ndb.Key("Period", self.period)
        period_entity = period_key.get()

        # If existing, abort or clear and start from scratch
        if period_entity:
            if self.force is not True:
                # Note: no HTTP error status is set on this path.
                s = "Version: %s\n" % __version__
                s += "Period %s already exists. " % self.period
                s += "Aborting. To override, use 'force=true'."
                logging.error(s)
                resp = {"status": "error", "message": s}
                self.response.write(json.dumps(resp) + "\n")
                return 1
            else:
                s = "Version: %s\n" % __version__
                s += "Period %s already exists. " % self.period
                s += "Overriding."
                logging.warning(s)

                # Delete Reports referencing period
                r = Report.query().filter(Report.reported_period == period_key)
                to_delete = r.fetch(keys_only=True)
                s = "Version: %s\n" % __version__
                s += "Deleting %d Report entities" % len(to_delete)
                logging.info(s)
                deleted = ndb.delete_multi(to_delete)
                s = "Version: %s\n" % __version__
                s += "%d Report entities removed" % len(deleted)
                logging.info(s)

                # Delete Period itself
                s = "Version: %s\n" % __version__
                s += "Deleting Period %s" % period_key
                logging.info(s)
                period_key.delete()
                s = "Version: %s\n" % __version__
                s += "Period %s deleted" % period_key
                logging.info(s)

        # Create new Period (id=YYYYMM)
        s = "Version: %s\n" % __version__
        s += "Creating new Period %s" % self.period
        logging.info(s)
        p = Period(id=self.period)
        p.year = y
        p.month = m
        p.status = 'in progress'
        period_key = p.put()

        # Check
        if period_key:
            s = "Version: %s\n" % __version__
            s += "New Period %s created successfully " % self.period
            s += "with key %s" % period_key
            logging.info(s)
        else:
            self.error(500)
            s = "Version: %s\n" % __version__
            s += "Could not create new Period %s" % self.period
            logging.error(s)
            resp = {"status": "error", "message": s}
            self.response.write(json.dumps(resp) + "\n")
            return 1

        # Clear temporary entities
        keys_to_delete = ReportToProcess.query().fetch(keys_only=True)
        s = "Version: %s\n" % __version__
        s += "Deleting %d temporal (internal use only) entities" % len(
            keys_to_delete)
        logging.info(s)
        ndb.delete_multi(keys_to_delete)
        return 0
Example #4
0
    def post(self):

        s = "Version: %s\n" % __version__
        s += "Arguments from POST:"
        for arg in self.request.arguments():
            s += '\n%s:%s' % (arg, self.request.get(arg))
        logging.info(s)

        # Try to get period from the request in case GetEvents was called directly
        try:
            self.period = self.request.get("period").lower()
            s = "Version: %s\n" % __version__
            s += "Period %s determined from request: %s" % (self.period,
                                                            self.request)
            logging.info(s)
        except Exception:
            pass

        # If real period not in request, try to get parameters from StatsRun entity
        # in case GetEvents was called from a previous task.
        if self.period is None or len(self.period) == 0:
            run_key = ndb.Key("StatsRun", 5759180434571264)
            run_entity = run_key.get()
            self.period = run_entity.period

        if self.period is None or len(self.period) == 0:
            self.error(400)
            resp = {
                "status": "error",
                "message": "Period parameter was not provided."
            }
            s = "Version: %s\n" % __version__
            s += "%s" % resp
            logging.error(s)
            self.response.write(json.dumps(resp) + "\n")
            return

        # If Period not already stored, halt
        period_key = ndb.Key("Period", self.period)
        period_entity = period_key.get()
        if not period_entity:
            self.error(400)
            resp = {
                "status": "error",
                "message": "Provided period does not exist in datastore",
                "data": {
                    "period": self.period
                }
            }
            logging.error(resp)
            self.response.write(json.dumps(resp) + "\n")
            return

        self.github_store = period_entity.github_store
        self.github_issue = period_entity.github_issue

        # Start the loop, until deadline
        try:

            # Prepare query for all Reports to process
            query = ReportToProcess.query()
            query = query.order(ReportToProcess.gbifdatasetid)
            s = "Version: %s\n" % __version__
            s += "ReportToProcess queried"
            logging.info(s)

            # Get cursor from request, if any
            cursor_str = self.request.get('cursor', None)
            cursor = None
            if cursor_str:
                cursor = Cursor(urlsafe=cursor_str)
            s = "Version: %s\n" % __version__
            s += "Cursor built: %s" % cursor
            logging.info(s)

            # Initialize loop
            more = True

            # Repeat while there are reports to process
            while more is True:

                # Get the next (or first) round of elements
                logging.info("Fetching %d entities" % PAGE_SIZE)
                results, new_cursor, more = query.fetch_page(
                    PAGE_SIZE, start_cursor=cursor)
                s = "Version: %s\n" % __version__
                s += "Got %d results" % len(results)
                logging.info(s)

                # Process and store transactionally
                self.process_and_store(results)

                # Restart with new cursor (if any)
                if more is True:
                    cursor = new_cursor
                    s = "Version: %s\n" % __version__
                    s += "New cursor: %s" % cursor.urlsafe()
                    logging.info(s)

            s = "Version: %s\n" % __version__
            s += "Finished processing reports"
            logging.info(s)

            period_entity = ndb.Key("Period", self.period).get()

            resp = {
                "status": "success",
                "message": "Successfully finished processing all reports",
                "data": {
                    "processed_searches": period_entity.processed_searches,
                    "processed_downloads": period_entity.processed_downloads
                }
            }

            # Launch process to store reports on GitHub, if applicable
            if self.github_store is True:
                resp['message'] += ". Launching GitHub storing process"
                taskqueue.add(url=URI_GITHUB_STORE, queue_name=QUEUENAME)

            # Launch process to create issues on GitHub, if applicable
            elif self.github_issue is True:
                resp['message'] += ". Launching GitHub issue process"
                taskqueue.add(url=URI_GITHUB_ISSUE, queue_name=QUEUENAME)

            # Otherwise, consider finished
            else:
                resp['message'] += ". No GitHub process launched"
                period_entity.status = "done"
                mail.send_mail(sender=EMAIL_SENDER,
                               to=EMAIL_RECIPIENT,
                               subject="Usage reports for period %s" %
                               self.period,
                               body="""
Hey there!

Just a brief note to let you know the extraction of %s stats has 
successfully finished, with no GitHub processes launched.

Congrats!
""" % self.period)

            # In any case, store the status, show message and finish
            period_entity.put()
            logging.info(resp)
            self.response.write(json.dumps(resp) + "\n")

            return

        # When timeout arrives...
        except DeadlineExceededError:
            # Launch new instance with current (failed) cursor
            taskqueue.add(url=URI_PROCESS_EVENTS,
                          params={"cursor": cursor.urlsafe()},
                          queue_name=QUEUENAME)
            s = "Version: %s\n" % __version__
            s += "Caught a DeadlineExceededError. Relaunching"
            logging.warning(s)

            resp = {
                "status": "in progress",
                "message": "Caught a DeadlineExceededError."
                " Relaunching with new cursor",
                "data": {
                    "period": self.period,
                    "cursor": cursor.urlsafe()
                }
            }
            logging.info(resp)
            self.response.write(json.dumps(resp) + "\n")

        return
Example #5
0
    def initialize_extraction(self, period=None, force=None):
        """Validate the period and create a fresh Period entity for it.

        The values actually used are ``self.period`` and ``self.force``; the
        ``period`` and ``force`` arguments are accepted but not read.

        Steps:
          1. Reject a missing or malformed period (must be YYYYMM) with a
             400 JSON error.
          2. If a Period entity already exists, abort unless ``self.force``
             is True; when forced, delete the Report entities referencing
             the period and the Period itself.
          3. Create the new Period with status 'in progress'.
          4. Delete every ReportToProcess entity.

        Returns:
            0 on success, 1 when the extraction could not be initialized (a
            JSON error body has been written to the response in that case).
        """
        self.response.headers['Content-Type'] = "application/json"

        # Check that 'period' is provided
        if not self.period:
            logging.error("Period not found on POST body. Aborting.")
            self.error(400)
            resp = {
                "status": "error",
                "message": "Period not found on POST body. " +
                           "Aborting."
            }
            self.response.write(json.dumps(resp) + "\n")
            return 1

        # Check that 'period' is valid. Year and month are parsed here, before
        # anything is deleted, so a non-numeric period is rejected cleanly.
        try:
            if len(self.period) != 6 or not self.period.isdigit():
                raise ValueError("period is not six digits")
            y, m = (int(self.period[:4]), int(self.period[-2:]))
            if not 1 <= m <= 12:
                raise ValueError("month out of range")
        except ValueError:
            logging.error("Malformed period. Should be YYYYMM (e.g., 201603)")
            self.error(400)
            resp = {
                "status": "error",
                "message": "Malformed period. Should be YYYYMM (e.g., 201603)"
            }
            self.response.write(json.dumps(resp) + "\n")
            return 1

        # Get existing period
        period_key = ndb.Key("Period", self.period)
        period_entity = period_key.get()

        # If existing, abort or clear and start from scratch
        if period_entity:
            if self.force is not True:
                # Note: no HTTP error status is set on this path.
                logging.error("Period %s already exists. " % self.period +
                              "Aborting. To override, use 'force=true'.")
                resp = {
                    "status": "error",
                    "message": "Period %s already exists. " % self.period +
                               "Aborting. To override, use 'force=true'."
                }
                self.response.write(json.dumps(resp) + "\n")
                return 1
            else:
                logging.warning("Period %s already exists. " % self.period +
                                "Overriding.")
                # Delete Reports referencing period
                r = Report.query().filter(Report.reported_period == period_key)
                to_delete = r.fetch(keys_only=True)
                logging.info("Deleting %d Report entities" % len(to_delete))
                deleted = ndb.delete_multi(to_delete)
                logging.info("%d Report entities removed" % len(deleted))

                # Delete Period itself
                logging.info("Deleting Period %s" % period_key)
                period_key.delete()
                logging.info("Period entity deleted")

        # Create new Period (id=YYYYMM)
        logging.info("Creating new Period %s" % self.period)
        p = Period(id=self.period)
        p.year = y
        p.month = m
        p.status = 'in progress'
        period_key = p.put()

        # Check
        if period_key:
            logging.info("New Period %s created successfully." % self.period)
            logging.info("New period's key = %s" % period_key)
        else:
            self.error(500)
            logging.error("Could not create new Period %s" % self.period)
            resp = {
                "status": "error",
                "message": "Could not create new Period %s" % self.period
            }
            self.response.write(json.dumps(resp) + "\n")
            return 1

        # Clear temporary entities
        keys_to_delete = ReportToProcess.query().fetch(keys_only=True)
        logging.info("Deleting %d temporal (internal use only) entities"
                     % len(keys_to_delete))
        ndb.delete_multi(keys_to_delete)

        return 0