def dataReceived(self, data):
    # probably should be using LineReceiver
    lines = (self.bufferclient + data).split("\r\n")
    self.bufferclient = lines.pop(-1)
    for lline in lines:
        line = lline.strip()

        # handle case where we have an http connection rather than plain socket connection
        if not self.clienttype and line[:4] == 'GET ':
            self.clienttype = "httpget"
            self.factory.clientConnectionRegistered(self)
            self.httpheaders = []
            self.httpgetreplied = False

        if self.clienttype == "httpget":
            self.handlehttpgetcase(line)

        # otherwise this is a message coming in from django or the browser editor
        # (though the django ones could be done with an httppost or something)
        elif line:
            try:
                parsed_data = json.loads(line)
            except ValueError:
                logger.warning("Nonparsable command")
                logger.info("Bad json parsing: client# %d %s" % (self.clientnumber, str([line[:1000]])))
                self.writejson({'content': "Command not json parsable: %s" % line, 'message_type': 'console'})
                continue
            if type(parsed_data) != dict or 'command' not in parsed_data:
                logger.info("Bad json parsing not dict: client# %d %s" % (self.clientnumber, str([line[:1000]])))
                self.writejson({'content': "Command not json dict with command: %s" % line, 'message_type': 'console'})
                continue
            command = parsed_data.get('command')
            self.clientcommand(command, parsed_data)
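# The comment above notes this should probably use Twisted's LineReceiver,
# which does the delimiter splitting and partial-line buffering itself. A
# minimal sketch of that alternative (hypothetical, not part of this file;
# clientcommand/writejson are the methods defined on this protocol):
#
#   from twisted.protocols.basic import LineReceiver
#
#   class CommandLineProtocol(LineReceiver):
#       delimiter = "\r\n"
#
#       def lineReceived(self, line):
#           # LineReceiver has already stripped the delimiter and buffered
#           # any partial line, so no bufferclient bookkeeping is needed
#           try:
#               parsed_data = json.loads(line)
#           except ValueError:
#               self.writejson({'content': "Command not json parsable: %s" % line, 'message_type': 'console'})
#               return
#           self.clientcommand(parsed_data.get('command'), parsed_data)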
def connectionLost(self, reason):
    if reason.type in [ResponseDone, PotentialDataLoss]:
        # sometimes need to print a huge amount out to read the html formatted error
        #logger.debug("updaterunobject response: %s" % str("".join(self.rbuffer)[:50000]))
        logger.debug("updaterunobject response: %s" % str(["".join(self.rbuffer)[:2000]]))
    else:
        logger.warning("updaterunobject response failed: " + str([reason.getErrorMessage(), reason.type, self.rbuffer]))
    self.finished.callback(None)
def connectionLost(self, reason):
    if reason.type in [ResponseDone, PotentialDataLoss]:
        try:
            jdata = json.loads("".join(self.rbuffer))
            if "language" in jdata:
                jdata["language"] = jdata["language"].lower()
            self.factory.requestoverduescrapersAction(jdata)
        except ValueError:
            #logger.warning("".join(self.rbuffer))
            logger.warning("request overdue bad json: " + str(self.rbuffer)[:1000] + " " + str(reason.type))
    else:
        logger.warning("requestoverduescrapers response failed: " + str([reason.getErrorMessage(), reason.type, self.rbuffer]))
    self.finished.callback(None)
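# Both connectionLost handlers above belong to body-receiver protocols handed
# to response.deliverBody() by the twisted.web.client.Agent callers in this
# file. For reference, a sketch of the usual shape of such a receiver (the
# rbuffer/finished attribute names match their use above):
#
#   from twisted.internet.protocol import Protocol
#
#   class updaterunobjectReceiver(Protocol):
#       def __init__(self, finished):
#           self.finished = finished  # Deferred fired from connectionLost
#           self.rbuffer = []         # accumulated response body chunks
#
#       def dataReceived(self, bytes):
#           self.rbuffer.append(bytes)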
class ScheduledRunMessageLoopHandler:
    def __init__(self, client, username, agent, runid, rev):
        # a partial implementation of editor.js
        self.exceptionmessage = []
        self.completiondata = None
        self.outputmessage = []
        self.domainscrapes = {}  # domain: { "pages_scraped":0, "bytes_scraped":0 }
        self.discarded_lines = 0
        self.discarded_characters = 0
        self.output = ""
        self.run_ended = None

        self.upost = {"django_key": djangokey}
        self.upost["pages_scraped"] = 0
        self.upost["records_produced"] = 0
        self.upost["scrapername"] = client.scrapername
        self.upost["clientnumber"] = client.clientnumber
        self.upost['runID'] = runid
        self.upost['run_id'] = runid
        self.upost['revision'] = rev
        self.username = username
        self.agent = agent

    def updaterunobjectFailure(self, failure):
        # could check for the type and use the retry functionality that exists in twister
        logger.info("updaterunobject failure received " + str(failure))

    def updaterunobjectResponse(self, response):
        finished = Deferred()
        response.deliverBody(updaterunobjectReceiver(finished))
        return finished

    def updaterunobject(self, bfinished):
        url = urlparse.urljoin(djangourl, 'scraper-admin/twistermakesrunevent/')
        logger.info("attempting to post: " + url)
        self.upost["output"] = self.output
        if bfinished:
            self.upost["exitstatus"] = self.exceptionmessage and 'exceptionmessage' or 'done'
            self.upost["domainscrapes"] = json.dumps(self.domainscrapes)
        if self.exceptionmessage:
            self.upost['exceptionmessage'] = self.exceptionmessage

        # urllib.urlencode applies str() to each value, which raises
        # UnicodeEncodeError on non-ascii unicode (e.g. chinese characters),
        # so encode any unicode values to utf8 first
        lupost = self.upost.copy()
        for key in lupost:
            if type(lupost[key]) == unicode:
                lupost[key] = lupost[key].encode("utf8")
        ulupost = urllib.urlencode(lupost)

        logger.info(self.upost)
        d = self.agent.request('POST', url, Headers(), StringProducer(ulupost))
        d.addCallbacks(self.updaterunobjectResponse, self.updaterunobjectFailure)

    def receiveline(self, line):
        try:
            data = json.loads(line)
            if not isinstance(data, dict):
                raise TypeError('Incorrect type of JSON')
        except Exception, e:
            logger.debug("Failed to json.loads() %s %s" % (str([line]), str(e)))
            return

        message_type = data.get('message_type')
        content = data.get("content")

        if message_type == 'executionstatus':
            if content == "startingrun":
                self.output = "%sEXECUTIONSTATUS: uml=%s username=%s runid=%s\n" % (self.output, data.get("uml"), self.username, data.get("runID"))
                self.updaterunobject(False)

            # generated by scriptmanager executor.js
            elif content == "runcompleted":
                logger.debug("Got run completed: %s" % (line,))
                self.completiondata = data
                self.completionmessage = ''
                if data.get('elapsed_seconds'):
                    self.completionmessage += str(data.get("elapsed_seconds")) + " seconds elapsed, "
                if data.get("CPU_seconds", False):  # until we can get CPU used
                    self.completionmessage += str(data.get("CPU_seconds")) + " CPU seconds used"
                if "exit_status" in data and data.get("exit_status") != 0:
                    self.completionmessage += ", exit status " + str(data.get("exit_status"))
                if "term_sig_text" in data:
                    self.completionmessage += ", terminated by " + data.get("term_sig_text")
                elif "term_sig" in data:
                    self.completionmessage += ", terminated by signal " + str(data.get("term_sig"))
                logger.debug("Completion status: %s" % (line,))

            # generated by twister after completion
            elif content == "runfinished":
                # run object is updated following the schedulecompleted() function call
                pass

            else:
                logger.warning("Unrecognized message: %s %s" % (message_type, content))

        elif message_type == "sources":
            self.upost["pages_scraped"] += 1  # soon to be deprecated
            url = data.get('url')
            netloc = "%s://%s" % urlparse.urlparse(url)[:2]
            if "first_url_scraped" not in self.upost and url and netloc[-16:] != '.scraperwiki.com' and url[-10:] != 'robots.txt':
                self.upost["first_url_scraped"] = data.get('url')
            if netloc:
                if netloc not in self.domainscrapes:
                    self.domainscrapes[netloc] = {"pages_scraped": 0, "bytes_scraped": 0}
                self.domainscrapes[netloc]["pages_scraped"] += 1
                self.domainscrapes[netloc]["bytes_scraped"] += int(data.get('bytes'))

        elif message_type == "data":
            self.upost["records_produced"] += 1

        elif message_type == "sqlitecall":
            if data.get('insert'):
                self.upost["records_produced"] += 1

        # only one of these ever
        elif message_type == "exception":
            self.upost["exception_message"] = data.get('exceptiondescription')
            for stackentry in data.get("stackdump"):
                sMessage = stackentry.get('file')
                if sMessage:
                    if sMessage == "<string>":
                        sMessage = "Line %d: %s" % (stackentry.get('linenumber', -1), stackentry.get('linetext'))
                    if stackentry.get('furtherlinetext'):
                        sMessage += " -- " + stackentry.get('furtherlinetext')
                    self.exceptionmessage.append(sMessage)
                if stackentry.get('duplicates') and stackentry.get('duplicates') > 1:
                    self.exceptionmessage.append(" + %d duplicates" % stackentry.get('duplicates'))
            if data.get("blockedurl"):
                self.exceptionmessage.append("Blocked URL: %s" % data.get("blockedurl"))
            self.exceptionmessage.append('')
            self.exceptionmessage.append(data.get('exceptiondescription'))

        elif message_type == "console":
            while content:
                self.outputmessage.append(content[:APPROXLENOUTPUTLIMIT])
                content = content[APPROXLENOUTPUTLIMIT:]

        elif message_type not in ["editorstatus", "saved", "chat"]:
            logger.info("Unknown record type: %s\n" % str([line[:2000]]))

        # live update of event output so we can watch it when debugging the scraperwiki platform;
        # reduce pressure on the server by only updating when we over-run the buffer
        if self.outputmessage and len(self.output) < APPROXLENOUTPUTLIMIT:
            while self.outputmessage:
                self.output = "%s%s" % (self.output, self.outputmessage.pop(0))
                if len(self.output) >= APPROXLENOUTPUTLIMIT:
                    self.output = "%s%s" % (self.output, temptailmessage)
                    self.updaterunobject(False)
                    break

        self.run_ended = datetime.datetime.now()
        #self.updaterunobject(False)

        while len(self.outputmessage) >= TAIL_LINES:
            discarded = self.outputmessage.pop(0)
            self.discarded_lines += 1
            self.discarded_characters += len(discarded)
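# updaterunobject() above posts its body via StringProducer, the standard
# Twisted IBodyProducer wrapper for a pre-built request body. If it is not
# already defined elsewhere in this module, it conventionally looks like
# this sketch (taken from the usual twisted.web.client.Agent pattern):
#
#   from zope.interface import implements
#   from twisted.internet.defer import succeed
#   from twisted.web.iweb import IBodyProducer
#
#   class StringProducer(object):
#       implements(IBodyProducer)
#
#       def __init__(self, body):
#           self.body = body
#           self.length = len(body)
#
#       def startProducing(self, consumer):
#           consumer.write(self.body)
#           return succeed(None)
#
#       def pauseProducing(self):
#           pass
#
#       def stopProducing(self):
#           pass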
def requestoverduescrapersFailure(self, failure):
    if failure.type == ConnectionRefusedError:
        logger.info("requestoverduescrapers ConnectionRefused")
    else:
        logger.warning("requestoverduescrapers failure received " + str(failure.type))
    failure.trap(ConnectionRefusedError)  # (doesn't do anything as there's no higher level error handling anyway)
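# failure.trap(ConnectionRefusedError) re-raises the failure unless it wraps
# a ConnectionRefusedError; in a longer errback chain that would pass any
# other error on to the next errback. A sketch of how that chain would be
# arranged (handler names here are illustrative, not from this file):
#
#   d = agent.request('GET', url)
#   d.addCallback(handleResponse)
#   d.addErrback(requestoverduescrapersFailure)  # traps ConnectionRefusedError
#   d.addErrback(logUnexpectedFailure)           # sees anything re-raised by trap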
def clientcommand(self, command, parsed_data):
    if command != 'typing':
        logger.debug("command %s client# %d" % (command, self.clientnumber))

    # update the lasttouch values on associated aggregations
    if command != 'automode' and self.clienttype == "editing":
        self.clientlasttouch = datetime.datetime.now()
        if self.guid and self.username:
            assert self.username in self.guidclienteditors.usereditormap
            self.guidclienteditors.usereditormap[self.username].userlasttouch = self.clientlasttouch
            self.guidclienteditors.scraperlasttouch = self.clientlasttouch

    # data uploaded when a new connection is made from the editor
    if command == 'connection_open':
        self.lconnectionopen(parsed_data)

    # finds the corresponding client and presses the run button on it
    # receives a single record through the pipeline
    elif command == 'stimulate_run':
        self.clienttype = "stimulate_run"
        self.factory.clientConnectionRegistered(self)

        scrapername = parsed_data["scrapername"]
        guid = parsed_data["guid"]
        assert guid
        username = parsed_data["username"]
        clientnumber = parsed_data["clientnumber"]

        client = None
        eoos = self.factory.guidclientmap.get(guid)
        if eoos:
            usereditor = eoos.usereditormap.get(username)
            if usereditor:
                for lclient in usereditor.userclients:
                    if lclient.clientnumber == clientnumber:
                        client = lclient

        if parsed_data.get('django_key') != djangokey:
            logger.error("djangokey_mismatch")
            self.writejson({'status': 'twister djangokey mismatch'})
            if client:
                client.writejson({"message_type": "console", "content": "twister djangokey mismatch"})
                client.writejson({'message_type': 'executionstatus', 'content': 'runfinished'})
            client = None

        if client:
            logger.info("stimulate on: %s %s client# %d" % (client.cchatname, client.scrapername, client.clientnumber))
            assert client.clienttype == "editing" and client.guid
            if not client.processrunning:
                client.runcode(parsed_data)
                self.writejson({"status": "run started"})
            else:
                client.writejson({"message_type": "console", "content": "client already running"})
                self.writejson({"status": "client already running"})
        else:
            parsed_data.pop("code", None)  # shorten the log message
            logger.warning("client not found %s" % parsed_data)
            self.writejson({"status": "client not found"})
        self.transport.loseConnection()

    elif command == 'rpcrun':
        self.username = parsed_data.get('username', '')
        self.userrealname = parsed_data.get('userrealname', self.username)
        self.scrapername = parsed_data.get('scrapername', '')
        self.scraperlanguage = parsed_data.get('language', '')
        self.guid = parsed_data.get("guid", '')
        if parsed_data.get('django_key') == djangokey:
            self.clienttype = "rpcrunning"
            logger.info("connection open %s: %s %s client# %d" % (self.clienttype, self.username, self.scrapername, self.clientnumber))
            self.factory.clientConnectionRegistered(self)
            self.runcode(parsed_data)
            # termination is by the calling function when it receives an executionstatus runfinished message
        else:
            logger.error("djangokey_mismatch")
            self.writejson({'status': 'twister djangokey mismatch'})
            self.transport.loseConnection()

    elif command == 'saved':
        line = json.dumps({'message_type': "saved", 'chatname': self.chatname})
        otherline = json.dumps({'message_type': "othersaved", 'chatname': self.chatname})
        self.guidclienteditors.rev = parsed_data["rev"]
        self.guidclienteditors.chainpatchnumber = 0
        self.writeall(line, otherline)
        self.factory.notifyMonitoringClientsSmallmessage(self, "savenote")
        # should record the rev and chainpatchnumber so when we join to this scraper we know

    elif command == 'typing':
        logger.debug("command %s client# %d insertlinenumber %s" % (command, self.clientnumber, parsed_data.get("insertlinenumber")))
        jline = {'message_type': "typing", 'content': "%s typing" % self.chatname}
        jotherline = parsed_data.copy()
        jotherline.pop("command")
        jotherline["message_type"] = "othertyping"
        jotherline["content"] = jline["content"]
        self.guidclienteditors.chainpatchnumber = parsed_data.get("chainpatchnumber")
        self.writeall(json.dumps(jline), json.dumps(jotherline))
        self.factory.notifyMonitoringClientsSmallmessage(self, "typingnote")

    # this one only applies to draft scrapers when you click run
    elif command == 'run':
        if self.processrunning:
            self.writejson({'content': "Already running! (shouldn't happen)", 'message_type': 'console'})
            return
        if self.username:
            if self.automode == 'autoload':
                self.writejson({'content': "Not supposed to run! " + self.automode, 'message_type': 'console'})
                return
        if parsed_data.get('guid'):
            self.writejson({'content': "scraper run can only be done through stimulate_run method", 'message_type': 'console'})
            return
        logger.info("about to run code %s" % str(parsed_data)[:100])
        self.runcode(parsed_data)

    # allows monitoring client to remotely kill processes
    elif command == "umlcontrol":
        if self.clienttype != "umlmonitoring":
            logger.error("umlcontrol called by non-monitoring client")
            return
        logger.info("umlcontrol %s" % ([parsed_data]))
        subcommand = parsed_data.get("subcommand")
        if subcommand == "killscraper":
            scrapername = parsed_data["scrapername"]
            for eoos in self.factory.guidclientmap.values():  # would be better if it was by scrapername instead of guid
                if eoos.scrapername == scrapername:
                    for usereditor in eoos.usereditormap.values():
                        for uclient in usereditor.userclients:
                            if uclient.processrunning:
                                logger.info("umlcontrol killing run on client# %d %s" % (uclient.clientnumber, scrapername))
                                uclient.kill_run()
        if subcommand == "killallscheduled":
            for client in self.factory.scheduledrunners.values():
                if client.processrunning:
                    logger.info("umlcontrol killing run on client# %d %s" % (client.clientnumber, client.scrapername))
                    client.kill_run()
                else:
                    logger.info("umlcontrol client# %d %s wasn't running" % (client.clientnumber, client.scrapername))
        if "maxscheduledscrapers" in parsed_data:
            self.factory.maxscheduledscrapers = parsed_data["maxscheduledscrapers"]
        self.factory.notifyMonitoringClients(None)

    elif command == "kill":
        if self.processrunning:
            self.kill_run()
        # allows the killing of a process in another open window by the same user
        elif self.username and self.guid:
            usereditor = self.guidclienteditors.usereditormap[self.username]
            for client in usereditor.userclients:
                if client.processrunning:
                    client.kill_run()

    elif command == 'chat':
        line = json.dumps({'message_type': 'chat', 'chatname': self.chatname, 'message': parsed_data.get('text'), 'nowtime': jstime(datetime.datetime.now())})
        self.writeall(line)

    elif command == 'requesteditcontrol':
        for usereditor in self.guidclienteditors.usereditormap.values():
            for client in usereditor.userclients:
                if client.automode == 'autosave':
                    client.writejson({'message_type': 'requestededitcontrol', "username": self.username})

    elif command == 'giveselrange':
        self.writeall(None, json.dumps({'message_type': 'giveselrange', 'selrange': parsed_data.get('selrange'), 'chatname': self.chatname}))

    elif command == 'automode':
        automode = parsed_data.get('automode')
        if automode == self.automode:
            return

        if not self.username:
            self.automode = automode
            self.factory.notifyMonitoringClients(self)
            return

        usereditor = self.guidclienteditors.usereditormap[self.username]

        # self-demote to autoload mode while choosing to promote a particular person to editing mode
        if automode == 'autoload':
            selectednexteditor = parsed_data.get('selectednexteditor')
            if selectednexteditor and selectednexteditor in self.guidclienteditors.usereditormap:
                assert self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority >= usereditor.usersessionpriority
                self.guidclienteditors.usereditormap[selectednexteditor].usersessionpriority = usereditor.usersessionpriority
            usereditor.usersessionpriority = self.guidclienteditors.usersessionprioritynext
            self.guidclienteditors.usersessionprioritynext += 1

        self.automode = automode
        self.guidclienteditors.notifyEditorClients("")
        self.factory.notifyMonitoringClients(self)

    # this message kills the connection more reliably than closing it from the browser end
    elif command == 'loseconnection':
        # Suspect it is possible in some cases that the client sends this command, and before
        # we have had a chance to close the connection from here, the client has already gone.
        # To cover this case let's handle the exception here and log that loseConnection failed
        try:
            self.transport.loseConnection()
        except:
            logger.debug('Closing connection on already closed connection failed')
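# Commands reach clientcommand() as newline-delimited JSON objects carrying a
# "command" field (see dataReceived above). A minimal sketch of a client
# driving this protocol; the host and port here are illustrative only:
#
#   import json, socket
#
#   s = socket.create_connection(("localhost", 9010))
#   s.sendall(json.dumps({"command": "chat", "text": "hello"}) + "\r\n")
#   s.close()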