Code example #1
0
 def dataReceived(self, data):
     """Accumulate raw socket bytes, split off complete CRLF-terminated
     lines and dispatch each one.

     The trailing partial line (anything after the last CRLF) is kept in
     self.bufferclient until more data arrives.  A line starting with
     'GET ' on a fresh connection switches the client into http mode;
     otherwise each non-empty line must be a json dict carrying a
     'command' key, which is forwarded to clientcommand().
     """
     # probably should be using LineReceiver
     lines = (self.bufferclient + data).split("\r\n")
     # last element is the (possibly empty) unterminated remainder
     self.bufferclient = lines.pop(-1)
     for lline in lines:
         line = lline.strip()

         # handle case where we have an http connection rather than plain socket connection
         if not self.clienttype and line.startswith('GET '):
             self.clienttype = "httpget"
             self.factory.clientConnectionRegistered(self)
             self.httpheaders = [ ]
             self.httpgetreplied = False

         if self.clienttype == "httpget":
             self.handlehttpgetcase(line)

             # otherwise this is a message coming in from django or the browser editor
             # (though the django ones could be done with an httppost or something)
         elif line:
             try:
                 parsed_data = json.loads(line)
             except ValueError:
                 logger.warning("Nonparsable command ")
                 logger.info("Bad json parsing: client# %d %s" % (self.clientnumber, str([line[:1000]])))
                 self.writejson({'content':"Command not json parsable:  %s " % line, 'message_type':'console'})
                 continue
             # commands must be a json object with a "command" key
             if not isinstance(parsed_data, dict) or 'command' not in parsed_data:
                 logger.info("Bad json parsing not dict: client# %d %s" % (self.clientnumber, str([line[:1000]])))
                 self.writejson({'content':"Command not json dict with command:  %s " % line, 'message_type':'console'})
                 continue
             command = parsed_data.get('command')
             self.clientcommand(command, parsed_data)
Code example #2
0
 def connectionLost(self, reason):
     """Fire the finished deferred once the response body stream closes.

     A clean close (ResponseDone / PotentialDataLoss) logs a truncated
     copy of the buffered body; anything else is logged as a warning.
     """
     cleanly_closed = reason.type in [ResponseDone, PotentialDataLoss]
     if cleanly_closed:
         # sometimes need to print huge amount out to read the html formatted error
         body = "".join(self.rbuffer)
         logger.debug("updaterunobject response: %s"% str([body[:2000]]))
     else:
         details = [reason.getErrorMessage(), reason.type, self.rbuffer]
         logger.warning("nope "+str(details))
     self.finished.callback(None)
Code example #3
0
 def connectionLost(self, reason):
     """On stream close, parse the buffered json body and hand it to the
     factory's overdue-scrapers handler; bad json is logged and dropped.
     """
     cleanly_closed = reason.type in [ResponseDone, PotentialDataLoss]
     if not cleanly_closed:
         details = [reason.getErrorMessage(), reason.type, self.rbuffer]
         logger.warning("nope "+str(details))
     else:
         body = "".join(self.rbuffer)
         try:
             jdata = json.loads(body)
             # language names are compared case-insensitively downstream
             if "language" in jdata:
                 jdata["language"] = jdata["language"].lower()
             self.factory.requestoverduescrapersAction(jdata)
         except ValueError:
             logger.warning("request overdue bad json: "+str(self.rbuffer)[:1000]+" "+str(reason.type))
     self.finished.callback(None)
Code example #4
0
class ScheduledRunMessageLoopHandler:
    """Consumes the json message stream from one scheduled scraper run and
    accumulates it into a POST-able summary (self.upost) for django.

    A partial implementation of editor.js: it understands the same
    message_type records the browser editor consumes (executionstatus,
    sources, data, sqlitecall, exception, console).
    """

    def __init__(self, client, username, agent, runid, rev):
        # a partial implementation of editor.js
        self.exceptionmessage = [ ]     # formatted stack/exception lines, uploaded on completion
        self.completiondata = None      # the "runcompleted" message data, once received
        self.outputmessage = [ ]        # console chunks waiting to be folded into self.output
        self.domainscrapes = { }  # domain: { "pages_scraped":0, "bytes_scraped":0 }
        self.discarded_lines = 0        # console chunks dropped once TAIL_LINES is exceeded
        self.discarded_characters = 0

        self.output = ""                # accumulated console output included in each POST
        self.run_ended = None

        # fields POSTed back to django's twistermakesrunevent view
        self.upost = {"django_key":djangokey}
        self.upost["pages_scraped"] = 0
        self.upost["records_produced"] = 0
        self.upost["scrapername"] = client.scrapername
        self.upost["clientnumber"] = client.clientnumber
        self.upost['runID'] = runid
        self.upost['run_id'] = runid
        self.upost['revision'] = rev

        self.username = username
        self.agent = agent              # twisted web Agent used for the POSTs

    def updaterunobjectFailure(self, failure):
        """Log a failed POST to django; no retry is attempted."""
            # could check for the type and use the retry functionality that exists in twister
        logger.info("requestoverduescrapers failure received "+str(failure))

    def updaterunobjectResponse(self, response):
        """Collect django's reply body via updaterunobjectReceiver."""
        finished = Deferred()
        response.deliverBody(updaterunobjectReceiver(finished))
        return finished

    def updaterunobject(self, bfinished):
        """POST the current run state to django's twistermakesrunevent view.

        bfinished -- True when the run is over; adds exitstatus,
        domainscrapes and (if any) the exception message to the upload.
        """
        url = urlparse.urljoin(djangourl, 'scraper-admin/twistermakesrunevent/')
        logger.info("attempting to post: "+url)
        self.upost["output"] = self.output
        if bfinished:
            self.upost["exitstatus"] = self.exceptionmessage and 'exceptionmessage' or 'done'
            self.upost["domainscrapes"] = json.dumps(self.domainscrapes)
            if self.exceptionmessage:
                self.upost['exceptionmessage'] = self.exceptionmessage
                
        # urllib.urlencode applies str() to each value in the list, which is dumb.
        # to get a proper error, print some chinese characters
        # need to get an explanation for this design of urlencode
        # (workaround: utf8-encode unicode values ourselves before urlencode)
        lupost = self.upost.copy()
        for key in lupost:
            if type(lupost[key]) == unicode:
                lupost[key] = lupost[key].encode("utf8")
        ulupost = urllib.urlencode(lupost)
        
        logger.info(self.upost)
        d = self.agent.request('POST', url, Headers(), StringProducer(ulupost))
        d.addCallbacks(self.updaterunobjectResponse, self.updaterunobjectFailure)
        
    def receiveline(self, line):
        """Handle one json line emitted by the running scraper process.

        Unparsable or non-dict lines are logged and dropped.  Recognized
        message_type values update the running totals in self.upost and
        the buffered console output.
        """
        try:
            data = json.loads(line)
            if not isinstance(data, dict):
                raise TypeError('Incorrect type of JSON')
        except Exception, e:
            logger.debug( "Failed to json.loads() %s %s" % (str([line]), str(e)))
            return
                    
        message_type = data.get('message_type')
        content = data.get("content")
        if message_type == 'executionstatus':
            if content == "startingrun":
                # record the run header and push an early update to django
                self.output = "%sEXECUTIONSTATUS: uml=%s usename=%s runid=%s\n" % (self.output, data.get("uml"), self.username, data.get("runID"))
                self.updaterunobject(False)

                # generated by scriptmanager executor.js
            elif content == "runcompleted":
                logger.debug( "Got run completed : %s" % (line,)  )
                self.completiondata = data
                # build a human-readable one-line completion summary
                self.completionmessage = '';
                if data.get('elapsed_seconds'):
                    self.completionmessage += str(data.get("elapsed_seconds")) + " seconds elapsed, " 
                if data.get("CPU_seconds", False): # Until we can get CPU used
                    self.completionmessage += str(data.get("CPU_seconds")) + " CPU seconds used";
                if "exit_status" in data and data.get("exit_status") != 0:
                    self.completionmessage += ", exit status " + str(data.get("exit_status"));
                if "term_sig_text" in data:
                    self.completionmessage += ", terminated by " + data.get("term_sig_text");
                elif "term_sig" in data:
                    self.completionmessage += ", terminated by signal " + str(data.get("term_sig"));
                logger.debug( "Completion status : %s" % (line,)  )

                # generated by twister after completion
            elif content == "runfinished":
                # run object is updated following the schedulecompleted() function call
                pass  
                
            else:
                logger.warning("Unrecognized message: %s %s" % (message_type, content))
            
        elif message_type == "sources":
            self.upost["pages_scraped"] += 1  # soon to be deprecated 
            
            url = data.get('url')
            # scheme://host of the fetched page
            netloc = "%s://%s" % urlparse.urlparse(url)[:2]
            # remember the first externally-scraped url (skip our own domain and robots.txt)
            if "first_url_scraped" not in self.upost and url and netloc[-16:] != '.scraperwiki.com' and url[-10:] != 'robots.txt':
                self.upost["first_url_scraped"] = data.get('url')
            if netloc:
                if netloc not in self.domainscrapes:
                    self.domainscrapes[netloc] = { "pages_scraped":0, "bytes_scraped":0 }
                self.domainscrapes[netloc]["pages_scraped"] += 1
                self.domainscrapes[netloc]["bytes_scraped"] += int(data.get('bytes'))
        
        elif message_type == "data":
            self.upost["records_produced"] += 1
        
        elif message_type == "sqlitecall":
            if data.get('insert'):
                self.upost["records_produced"] += 1
        
            # only one of these ever
        elif message_type == "exception":   
            self.upost["exception_message"] = data.get('exceptiondescription')
            # build a readable traceback out of the structured stackdump
            for stackentry in data.get("stackdump"):
                sMessage = stackentry.get('file')
                if sMessage:
                    if sMessage == "<string>":
                        sMessage = "Line %d: %s" % (stackentry.get('linenumber', -1), stackentry.get('linetext'))
                    if stackentry.get('furtherlinetext'):
                        sMessage += " -- " + stackentry.get('furtherlinetext') 
                    self.exceptionmessage.append(sMessage)
                if stackentry.get('duplicates') and stackentry.get('duplicates') > 1:
                    self.exceptionmessage.append("  + %d duplicates" % stackentry.get('duplicates'))
            
            if data.get("blockedurl"):
                self.exceptionmessage.append("Blocked URL: %s" % data.get("blockedurl"))
            self.exceptionmessage.append('')
            self.exceptionmessage.append(data.get('exceptiondescription'))
        
        elif message_type == "console":
            # chop console output into bounded chunks for the output buffer
            while content:
                self.outputmessage.append(content[:APPROXLENOUTPUTLIMIT])
                content = content[APPROXLENOUTPUTLIMIT:]

        elif message_type not in ["editorstatus", "saved", "chat"]:
            logger.info("Unknown record type: %s\n" % str([line[:2000]]))
            
        
        # live update of event output so we can watch it when debugging scraperwiki platform
        # reduce pressure on the server by only updating when we over-run the buffer
        if self.outputmessage and len(self.output) < APPROXLENOUTPUTLIMIT:
            while self.outputmessage:
                self.output = "%s%s" % (self.output, self.outputmessage.pop(0))
                if len(self.output) >= APPROXLENOUTPUTLIMIT:
                    self.output = "%s%s" % (self.output, temptailmessage)
                    self.updaterunobject(False) 
                    break
            self.run_ended = datetime.datetime.now()
            #self.updaterunobject(False)

        # cap the pending console buffer, counting what gets thrown away
        while len(self.outputmessage) >= TAIL_LINES:
            discarded = self.outputmessage.pop(0)
            self.discarded_lines += 1
            self.discarded_characters += len(discarded)
Code example #5
0
 def requestoverduescrapersFailure(self, failure):
     """Log a failed overdue-scrapers request; connection refusals are
     expected and only logged at info level."""
     refused = (failure.type == ConnectionRefusedError)
     if refused:
         logger.info("requestoverduescrapers ConnectionRefused")
     else:
         logger.warning("requestoverduescrapers failure received "+str(failure.type))
     failure.trap(ConnectionRefusedError)  # (doesn't do anything as there's no higher level error handling anyway)
Code example #6
0
    def clientcommand(self, command, parsed_data):
        """Dispatch one parsed json command from a connected client.

        Commands handled: connection_open, stimulate_run, rpcrun, saved,
        typing, run, umlcontrol, kill, chat, requesteditcontrol,
        giveselrange, automode, loseconnection.  Unknown commands are
        silently ignored.
        """
        if command != 'typing':
            logger.debug("command %s client# %d" % (command, self.clientnumber))
        
        # update the lasttouch values on associated aggregations
        if command != 'automode' and self.clienttype == "editing":
            self.clientlasttouch = datetime.datetime.now()
            if self.guid and self.username:
                assert self.username in self.guidclienteditors.usereditormap
                self.guidclienteditors.usereditormap[self.username].userlasttouch = self.clientlasttouch
                self.guidclienteditors.scraperlasttouch = self.clientlasttouch

        # data uploaded when a new connection is made from the editor
        if command == 'connection_open':
            self.lconnectionopen(parsed_data)

                # finds the corresponding client and presses the run button on it
                # receives a single record through the pipeline
        elif command == 'stimulate_run':
            self.clienttype = "stimulate_run"
            self.factory.clientConnectionRegistered(self)  

            scrapername = parsed_data["scrapername"]
            guid = parsed_data["guid"]
            assert guid
            username = parsed_data["username"]
            clientnumber = parsed_data["clientnumber"]

            # locate the editing client identified by (guid, username, clientnumber)
            client = None
            eoos = self.factory.guidclientmap.get(guid)
            if eoos:
                usereditor = eoos.usereditormap.get(username)
                if usereditor:
                    for lclient in usereditor.userclients:
                        if lclient.clientnumber == clientnumber:
                            client = lclient

            # refuse to run unless the caller proves it is our django instance
            if parsed_data.get('django_key') != djangokey:
                logger.error("djangokey_mismatch")
                self.writejson({'status':'twister djangokey mismatch'})
                if client:
                    client.writejson({"message_type":"console", "content":"twister djangokey mismatch"})  
                    client.writejson({'message_type':'executionstatus', 'content':'runfinished'})
                    client = None
            
            if client:
                logger.info("stimulate on : %s %s client# %d" % (client.cchatname, client.scrapername, client.clientnumber))
                assert client.clienttype == "editing" and client.guid
                if not client.processrunning:
                    client.runcode(parsed_data)
                    self.writejson({"status":"run started"})  
                else:
                    client.writejson({"message_type":"console", "content":"client already running"})  
                    self.writejson({"status":"client already running"})  
            else:
                parsed_data.pop("code", None)   # shorten the log message
                logger.warning("client not found %s" % parsed_data)
                self.writejson({"status":"client not found"})  

            # stimulate_run is a one-shot connection
            self.transport.loseConnection()

        elif command == 'rpcrun':
            self.username = parsed_data.get('username', '')
            self.userrealname = parsed_data.get('userrealname', self.username)
            self.scrapername = parsed_data.get('scrapername', '')
            self.scraperlanguage = parsed_data.get('language', '')
            self.guid = parsed_data.get("guid", '')
            if parsed_data.get('django_key') == djangokey:
                self.clienttype = "rpcrunning"
                logger.info("connection open %s: %s %s client# %d" % (self.clienttype, self.username, self.scrapername, self.clientnumber)) 
                self.factory.clientConnectionRegistered(self)  
                self.runcode(parsed_data)
                    # termination is by the calling function when it receives an executionstatus runfinished message
            else:
                logger.error("djangokey_mismatch")
                self.writejson({'status':'twister djangokey mismatch'})
                self.transport.loseConnection()


        elif command == 'saved':
            # notify self with "saved" and everyone else with "othersaved"
            line = json.dumps({'message_type' : "saved", 'chatname' : self.chatname})
            otherline = json.dumps({'message_type' : "othersaved", 'chatname' : self.chatname})
            self.guidclienteditors.rev = parsed_data["rev"]
            self.guidclienteditors.chainpatchnumber = 0
            self.writeall(line, otherline)
            self.factory.notifyMonitoringClientsSmallmessage(self, "savenote")


    # should record the rev and chainpatchnumber so when we join to this scraper we know
        elif command == 'typing':
            logger.debug("command %s client# %d insertlinenumber %s" % (command, self.clientnumber, parsed_data.get("insertlinenumber")))
            jline = {'message_type' : "typing", 'content' : "%s typing" % self.chatname}
            # forward the full typing patch to the other editors
            jotherline = parsed_data.copy()
            jotherline.pop("command")
            jotherline["message_type"] = "othertyping"
            jotherline["content"] = jline["content"]
            self.guidclienteditors.chainpatchnumber = parsed_data.get("chainpatchnumber")
            self.writeall(json.dumps(jline), json.dumps(jotherline))
            self.factory.notifyMonitoringClientsSmallmessage(self, "typingnote")
            
            # this one only applies to draft scrapers when you click run
        elif command == 'run':
            if self.processrunning:
                self.writejson({'content':"Already running! (shouldn't happen)", 'message_type':'console'}); 
                return 
            if self.username:
                if self.automode == 'autoload':
                    self.writejson({'content':"Not supposed to run! "+self.automode, 'message_type':'console'}); 
                    return 
            
            if parsed_data.get('guid'):
                self.writejson({'content':"scraper run can only be done through stimulate_run method", 'message_type':'console'}); 
                return 

            logger.info("about to run code %s" % str(parsed_data)[:100])
            self.runcode(parsed_data)
        
        elif command == "umlcontrol":
                # allows monitoring client to remotely kill processes
            if self.clienttype != "umlmonitoring":
                logger.error("umlcontrol called by non-monitoring client")
                return
                
            logger.info("umlcontrol %s" % ([parsed_data]))

            subcommand = parsed_data.get("subcommand")
            if subcommand == "killscraper":
                scrapername = parsed_data["scrapername"]
                for eoos in self.factory.guidclientmap.values():   # would be better if it was by scrapername instead of guid
                    if eoos.scrapername == scrapername:
                        for usereditor in eoos.usereditormap.values():
                            for uclient in usereditor.userclients:
                                if uclient.processrunning:
                                    logger.info("umlcontrol killing run on client# %d %s" % (uclient.clientnumber, scrapername))
                                    uclient.kill_run()

            if subcommand == "killallscheduled":
                for client in self.factory.scheduledrunners.values():
                    if client.processrunning:
                       logger.info("umlcontrol killing run on client# %d %s" % (client.clientnumber, client.scrapername))
                       client.kill_run()
                    else:
                        logger.info("umlcontrol client# %d %s wasn't running" % (client.clientnumber, client.scrapername))

            if "maxscheduledscrapers" in parsed_data:
                self.factory.maxscheduledscrapers = parsed_data["maxscheduledscrapers"]
                self.factory.notifyMonitoringClients(None)

        elif command == "kill":
            if self.processrunning:
                self.kill_run()
                
                # allows the killing of a process in another open window by same user
            elif self.username and self.guid:   
                usereditor = self.guidclienteditors.usereditormap[self.username]
                for client in usereditor.userclients:
                    if client.processrunning:
                        client.kill_run()

        elif command == 'chat':
            line = json.dumps({'message_type':'chat', 'chatname':self.chatname, 'message':parsed_data.get('text'), 'nowtime':jstime(datetime.datetime.now()) })
            self.writeall(line)
        
        elif command == 'requesteditcontrol':
            # ask whichever client currently holds edit control to hand it over
            for usereditor in self.guidclienteditors.usereditormap.values():
                for client in usereditor.userclients:
                    if client.automode == 'autosave':
                        client.writejson({'message_type':'requestededitcontrol', "username":self.username})
        
        elif command == 'giveselrange':
            self.writeall(None, json.dumps({'message_type':'giveselrange', 'selrange':parsed_data.get('selrange'), 'chatname':self.chatname }))
            
        elif command == 'automode':
            automode = parsed_data.get('automode')
            if automode == self.automode:
                return

            if not self.username:
                self.automode = automode
                self.factory.notifyMonitoringClients(self)
                return

            usereditor = self.guidclienteditors.usereditormap[self.username]
 
                # self-demote to autoload mode while choosing to promote a particular person to editing mode
            if automode == 'autoload':
                selectednexteditor = parsed_data.get('selectednexteditor')
                if selectednexteditor and selectednexteditor in self.guidclienteditors.usereditormap:
                    assert self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority >= usereditor.usersessionpriority
                    self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority = usereditor.usersessionpriority
                usereditor.usersessionpriority = self.guidclienteditors.usersessionprioritynext
                self.guidclienteditors.usersessionprioritynext += 1
            
            self.automode = automode
            
            self.guidclienteditors.notifyEditorClients("")
            self.factory.notifyMonitoringClients(self)

        # this message helps kill it better and killing it from the browser end
        elif command == 'loseconnection':
            # Suspect it is possible in some cases that the client sends this command, and before
            # we have had a chance to close the connection from here, the client has already gone.
            # To cover this case let's handle the exception here and log that loseConnection failed
            try:
                self.transport.loseConnection()
            except: 
                logger.debug('Closing connection on already closed connection failed')