def fogbugzOnFail(self, logfp):
    """
    Create a FogBugz ticket for a failed parser.

    *Required Parameter*

    :param logfp: log file path
    """
    print "Creating FogBugz Ticket"
    cfp = Config(self.__fpath)
    attempts = 0
    run = True
    while run is True and attempts < 3:
        # count the attempt up front so a persistent failure stops after three tries
        attempts += 1
        try:
            site = FogBugz(cfp.getVar("fogbugz", "site", "string"))
            try:
                site.logon(cfp.getVar("fogbugz", "user", "string"),
                           cfp.getVar("fogbugz", "pass", "string"))
                with open(logfp, 'rb') as fp:
                    print site.new(sTitle="The Parser " + os.path.join(self.__logbase, self.__execute) + " Failed",
                                   ixPersonAssignedTo="Andy",
                                   Files={"faillog.txt": fp})
                run = False
            except Exception, e:
                print str(e)
                for frame in traceback.extract_tb(sys.exc_info()[2]):
                    print '\n'.join([str(x) for x in frame])
            finally:
                site.logoff()
        except Exception, e:
            print str(e)
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                print '\n'.join([str(x) for x in frame])
class ProxyManager(object):

    def __init__(self, fpath):
        self.__fpath = fpath
        if self.__fpath is None or os.path.exists(self.__fpath) is False:
            raise ConfigurationNotFoundException("Valid Configuration must be Generated")
        self._cfp = Config(self.__fpath)

    def getProxies(self, number, domain):
        return json.loads(urllib2.urlopen(
            urlparse.urljoin(self._cfp.getVar("proxymanager", "server", "String").strip(),
                             "getProxy?auth=&number={}&domain={}".format(number, domain.strip())),
            None, timeout=120).read())

    def dropProxy(self, domain, ip):
        return json.loads(urllib2.urlopen(
            urlparse.urljoin(self._cfp.getVar("proxymanager", "server", "String").strip(),
                             "dropProxyForJson?auth=&domain={}&ip={}".format(domain.strip(), ip.strip())),
            None, timeout=120).read())

    def dropDomain(self, domain):
        return json.loads(urllib2.urlopen(
            urlparse.urljoin(self._cfp.getVar("proxymanager", "server", "String").strip(),
                             "dropDomainForJsony?auth=&domain={}".format(domain.strip())),
            None, timeout=120).read())
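# Usage sketch for ProxyManager (illustrative only; the config path, domain, and
# proxy count below are assumptions, not part of this module):
#
#   manager = ProxyManager("/etc/crawler/config.ini")
#   proxies = manager.getProxies(5, "example.com")   # JSON decoded from the getProxy endpoint
#   manager.dropProxy("example.com", "10.0.0.1")     # release a single proxy for the domain
#   manager.dropDomain("example.com")                # release every proxy held for the domain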
def run(self):
    """
    Executes the crawler as a separate process and monitors for completion.
    The worker itself is a Thread so run is the necessary name of the method.
    """
    print "Executing " + self.__execute
    p = Psyco(self.__fpath)
    cfp = Config(self.__fpath)
    if self.__execute is not None and self.__logbase is not None:
        try:
            logfp = self.__logbase + "logs/" + self.__execute.replace(".xml", "") + str(int(round(time.time()))) + ".txt"
            self.__sema.increment(self.__lock)
            try:
                p.execute("INSERT INTO crawlers.ca_crim_parsers_running(name) VALUES('" + self.__execute + "')")
                p.execute("DELETE FROM crawlers.ca_crim_parsers_not_run WHERE name LIKE '" + self.__execute + "'")
                stdfile = open(logfp, 'w+')
                # the products config file will be in the base directory
                cmd = "/usr/bin/java -Xms" + cfp.getVar("Pentaho", "xms", "string").strip() + " -Xmx" + cfp.getVar("Pentaho", "xmx", "string").strip() + " -XX:+UseConcMarkSweepGC -Xcompactexplicitgc -Dbeansfile=" + self.__logbase + self.__execute + " -jar " + self.__jarfile
                print cmd
                pipe = subprocess.Popen(shlex.split(cmd), stdout=stdfile, stderr=subprocess.STDOUT, shell=False)
                ret = pipe.wait()
                print "Completed " + self.__execute
                p.execute("DELETE FROM crawlers.ca_crim_parsers_running WHERE name LIKE '" + self.__execute + "'")
                if ret == 0:
                    p.execute("INSERT INTO crawlers.ca_crim_parsers_complete (name,crawlstamp) VALUES('" + self.__execute + "','" + str(self.__datestamp) + "')")
                else:
                    print "PARSERS - Premature Detonation!!! Failure " + self.__execute
                    print str(ret)
                    if cfp.getVar("notification", "err_type", "string") == 'fog':
                        self.fogbugzOnFail(logfp)
                    else:
                        self.emailOnFail(logfp)
            finally:
                print "Decrementing"
                self.__sema.decrement(self.__lock)
        except Exception, e:
            print "Process Failed"
            print str(e)
            for frame in traceback.extract_tb(sys.exc_info()[2]):
                fname, lineno, fn, text = frame
                print "Error in %s on line %d" % (fname, lineno)
    print "Worker Complete " + str(time.time())
def emailOnFail(self, logfp):
    """
    Email on a failed crawler. Sender should not be internal for outlook purposes.

    *Required Parameter*

    :param logfp: log file path
    """
    print "Sending Alert Message"
    cfp = Config(self.__fpath)
    # get log as attachment
    att = ""
    with open(logfp, 'r') as fp:
        att = fp.read()
    # prep
    body = MIMEMultipart('alternative')
    body.attach(MIMEText("PARSER - Premature Detonation!!! Failed or Terminated Parser: " + os.path.join(self.__logbase, self.__execute)))
    msg = MIMEMultipart()
    msg.attach(body)
    msg['From'] = cfp.getVar("mail", "sender", "string")
    msg['To'] = cfp.getVar("mail", "recipients", "string")
    msg['Subject'] = "Failed CACRIM Parser"
    part = MIMEBase('application', "octet-stream")
    part.set_payload(att)
    Encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment', filename="faillog.txt")
    # attach
    msg.attach(part)
    # send; recipients must be a flat list of addresses
    mailobj = smtplib.SMTP(cfp.getVar("mail", "host", "string"))
    mailobj.sendmail(cfp.getVar("mail", "sender", "string"),
                     cfp.getVar("mail", "recipients", "string").split(","),
                     msg.as_string())
    mailobj.quit()
    print "Mail Sent"
def setup(self, folder, table, confPath, createImages=False):
    '''
    Setup status table and folder table.

    *Required Parameters*

    :param folder: the folder name to use
    :param table: the table to use in setting up the status
    :param confPath: the configuration path

    *Optional Parameters*

    :param createImages: whether or not to create an images folder
    '''
    cfp = Config(confPath)
    os.mkdir(cfp.getVar("Images", "fpath", "string") + "FL/SOR/" + folder)
    p = Psyco(confPath)
    if createImages is True:
        # update crawl specific tables
        p.execute("INSERT INTO folders.names (folder,schema) VALUES ('" + folder + "','us_fl_crawlsor')")
        # the base folder already exists at this point; only the Images subfolder is created here
        os.mkdir(cfp.getVar("Images", "fpath", "string") + "FL/SOR/" + folder + "/Images/")
def changeOnComplete(self, alterSchema, confPath, folder=None):
    '''
    Handles the final cleanup on the last part of the crawl.

    *Required Parameters*

    :param alterSchema: the schema to rename and use
    :param confPath: the configuration path to use

    *Optional Parameters*

    :param folder: folder name to use

    NOTE: The folder name should be the same as provided from getFolder and/or to setup if used.
    '''
    cfp = Config(confPath)
    p = Psyco(confPath)
    rename = "us_fl_crawlsor" + datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
    print("Renaming Schema to " + rename)
    p.execute("ALTER SCHEMA us_fl_crawlsor RENAME TO " + rename)
    if folder is not None:
        # update the folder name information
        p.execute("UPDATE folders.names SET status=true,schema='" + rename + "' WHERE folder='" + folder + "'")
    p.execute("INSERT INTO " + cfp.getVar("finished", "alterschema", "string") + "." + cfp.getVar("finished", "altertable", "string") + "(filename,schema) VALUES('" + re.search('.*?\/([^\/]+\.py.*)', inspect.getfile(inspect.currentframe())).group(1) + "','" + rename + "')")
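# Call-order sketch (illustrative; the folder name, table, and config path are assumptions):
# setup() is expected to run before the crawl and changeOnComplete() after it, with the same
# folder name passed to both so folders.names can be updated with the renamed schema.
#
#   folder = "20240101"
#   self.setup(folder, "us_fl_crawlsor.status", "/etc/crawler/config.ini", createImages=True)
#   ... crawl runs here ...
#   self.changeOnComplete("us_fl_crawlsor", "/etc/crawler/config.ini", folder=folder)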
class Errors:
    """
    Errors is the class that handles all error processing.
    """

    def __init__(self, fpath=None, logger=None):
        """
        *Optional Parameters*

        :param fpath: The file path to use.
        :param logger: The logger to use.
        """
        self._errors = {}
        self._fpath = fpath
        self._cfp = None
        self.errCo = -1
        if self._fpath is not None:
            self._cfp = Config(self._fpath)
        self.logger = logger

    @property
    def errCo(self):
        return self.__errCo

    @errCo.setter
    def errCo(self, errCo):
        self.__errCo = errCo

    def printError(self, msg):
        """
        General Error Printing that checks for a log (Python 2.7 cannot override print).

        :param msg: The message
        """
        if self.logger is not None:
            self.logger.error(msg)
        else:
            print(msg)

    # print traceback frames
    def printFrames(self, frames):
        """
        Prints traceback frames.

        :param frames: The exception frames.
        """
        for frame in frames:
            fname, lineno, fn, text = frame
            self.printError("Error in %s on line %d" % (fname, lineno))

    def postToErrorTables(self, sourcename, e, frames):
        """
        Takes in traceback frames and posts them to database. The expectation is that the
        data collected here is useful in anomaly detection. An error code of -1 specifies
        that an attribute has no code. This is a property that can be changed.

        :param sourcename: The name of the source.
        :param frames: The traceback frames from the error.
        :param e: The error, Exception instance.
        """
        code = self.__errCo
        description = None
        etype = str(type(e))
        if "HTTPError" in etype:
            code = e.code
            description = str(e)
        elif hasattr(e, "errno") is True:
            code = e.errno
            if "URLError" in etype:
                description = str(e.reason)
            else:
                description = str(e)
        if sourcename is None:
            self.printError("Please Specify a Source Name")
        else:
            if self._fpath is not None and self._cfp is not None and etype is not None and description is not None:
                p = Psyco(self._fpath)
                p.execute(
                    "INSERT INTO " + self._cfp.getVar("Error", "table", "string") +
                    " (source,type,code,description) VALUES('" + sourcename + "','" +
                    etype + "','" + str(code) + "','" + description + "')"
                )
            elif etype is None or description is None:
                self.printError("Type or Description Not Found for the Error. Cannot insert To Database.")
            else:
                self.printError("Please Specify a Config Path")

    # handle http error
    def httpError(self, e, link, infailcodes, inkillswitch, incurrpos, frames):
        """
        Handles httpErrors.

        *Required Parameters*

        :param e: error instance
        :param link: url address when the error occurred
        :param infailcodes: fail codes for the url
        :param inkillswitch: number of failures to the current point
        :param incurrpos: current position in the link list on fail
        :param frames: the traceback frames at failure
        """
        currpos = incurrpos
        killswitch = inkillswitch
        failcodes = infailcodes
        print "Failed to Get Page " + link
        if "URLError" in str(type(e)):
            self.printError("URLError")
            print e.reason
            failcodes += str(e.reason) + ","
        elif "HTTPError" in str(type(e)):
            self.printError("HTTPError")
            print e.code
            failcodes += str(e.code) + ","
        else:
            self.printError("Exception Error")
            print str(e)
            failcodes += str(e) + ","
        if killswitch < 5:
            killswitch += 1
        else:
            self.printError("Critical Failure for " + link)
            # reset the kill switch and move past the failing link
            killswitch = 0
            currpos += 1
        self.printFrames(frames)
        return (failcodes, killswitch, currpos)

    # crawler setup failure
    def crawlerSetupError(self, e, frames):
        """
        Handles failures in crawler setup while taking in the error instance and traceback frames.
        """
        self.printError("Crawler Setup Failed")
        self.error(e, frames)

    # crawler process failure other than in the pull
    def crawlerFail(self, e, frames, restart):
        """
        Handles a failure in the crawler.

        :param e: The exception.
        :param frames: The frames to use.
        :param restart: Whether or not the process will restart.
        """
        print "Failed to Get Pages."
        if restart is True:
            self.printError("Will Restart This Iteration")
        self.error(e, frames)

    # Prints an error without a header
    def error(self, e, frames):
        """
        General error method for handling error instances and traceback frames.

        :param e: The exception.
        :param frames: The exception frames.
        """
        self.printError(str(e))
        self.printFrames(frames)
def parse(self):
    """
    Run the parser.
    """
    cfp = Config(self.__fpath)
    runlist = []
    files = self.getXmlFiles(cfp.getVar("base", "basedir", "string"))
    print "Files Found: ",
    print sorted(files)
    # get completed crawlers and parsers
    p = Psyco(self.__fpath)
    data = p.getData("SELECT q1.name,q1.stamp FROM (SELECT * FROM (SELECT name,max(date) as stamp FROM crawlers.ca_crim_weekly_complete GROUP BY name) as q1 UNION (SELECT name,max(date) FROM crawlers.ca_crim_monthly_complete GROUP BY name UNION SELECT name,max(date) FROM crawlers.ca_crim_daily_complete GROUP BY name)) as q1 LEFT OUTER JOIN (SELECT name,crawlstamp FROM crawlers.ca_crim_parsers_complete) as q2 ON q1.stamp = q2.crawlstamp AND regexp_replace(q1.name,'.py','') = regexp_replace(q2.name,'.xml','') WHERE q2.name IS NULL")
    nilList = []
    for xfile in data:
        if re.sub('\.py.*', '', xfile[0]) + ".xml" not in files:
            nilList.append(re.sub('\.py.*', '', xfile[0]) + ".xml")
    if len(nilList) > 0:
        print "Files Do Not Exist for the Following (the name but not the extension must match)",
        print sorted(nilList)
    else:
        print "All Crawlers Have Corresponding Parsers"
    # get datestamp dict -- assumes that the query gets the max(date)
    dates = {}
    for xfile in data:
        fp = re.sub('\.py.*', '', xfile[0]) + ".xml"
        if fp not in dates:
            dates[fp] = xfile[1]
    for xfile in data:
        if xfile is not None:
            fp = xfile[0]
            if fp.replace('.py', '.xml') in files:
                runlist.append(fp.replace('.py', '.xml'))
    print "Execution List: ",
    print sorted(runlist)
    nilList = []
    for xfile in data:
        if re.sub('\.py.*', '', xfile[0]) + ".xml" not in files:
            nilList.append(re.sub('\.py.*', '', xfile[0]) + ".xml")
    if len(nilList) > 0:
        print "Parsers that may not have been Found",
        print sorted(nilList)
    else:
        print "All Completed and Found crawlers Accounted For"
    # run the crawlers from the run list
    if len(runlist) > 0:
        pnum = 0
        pool = []
        # get the semaphore and lock
        sema = IncSemaphore()
        lock = Lock()
        # max processes
        maxprocs = cfp.getVar("opts", "maxdel", "int")
        print "Max Processes: " + str(maxprocs)
        # run
        while len(runlist) > 0:
            if pnum < maxprocs and len(runlist) > 0 and runlist[0] in files:
                # pop the head of the list, which is the item just checked against files
                ex = runlist.pop(0)
                print "Starting " + ex
                w = Worker(self.__fpath, ex.strip(), cfp.getVar("base", "logbase", "string"),
                           cfp.getVar("db", "dbname", "string"), cfp.getVar("db", "user", "string"),
                           cfp.getVar("db", "passw", "string"), cfp.getVar("db", "host", "string"),
                           cfp.getVar("db", "port", "string"), sema, lock, dates[ex],
                           cfp.getVar("jars", "parser", "string"))
                w.start()
                pool.append(w)
                pnum += 1
            print "Continuing"
            while sema.getValue() >= maxprocs and len(pool) > 0:
                print "Waiting for Parsers to Complete"
                time.sleep(random.randint(1, 120))
                # iterate over a copy so removals do not skip workers
                for proc in pool[:]:
                    if not proc.isAlive():
                        pool.remove(proc)
                        del proc
                        pnum -= 1
                if len(pool) == 0 and sema.getValue() > 0:
                    sema.setValue(0, lock)
                gc.collect()
                del gc.garbage[:]
            for proc in pool[:]:
                if not proc.isAlive():
                    pool.remove(proc)
                    del proc
                    pnum -= 1
            if len(pool) == 0 and sema.getValue() > 0:
                sema.setValue(0, lock)
            gc.collect()
            del gc.garbage[:]
            if sema.getValue() == 0 and len(runlist) == 0:
                break
            time.sleep(30)
        print "Completed Loop. Awaiting any final Completions."
        gc.collect()
        del gc.garbage[:]
        while sema.getValue() > 0 and len(pool) > 0:
            print "Waiting for Completion"
            time.sleep(random.randint(1, 120))
            if len(pool) == 0 and sema.getValue() > 0:
                sema.setValue(0, lock)
    print "Current Execution List Complete. Will Restart Soon!"
def start(self):
    """
    Start the controller. The controller is not multi-processed. Paths are configured,
    executables obtained, workers run, and logs compressed from here.
    """
    # get configuration file
    cfp = Config(self.__fpath)
    if not os.path.isdir(cfp.getVar("base", "basedir", "string") + "logs"):
        os.mkdir(cfp.getVar("base", "basedir", "string") + "logs")
    # setup variables
    maxprocs = cfp.getVar("opts", "maxdel", "int")
    # the threadpool
    pool = []
    # set up db
    p = Psyco(cfp.getVar("db", "dbname", "string"), cfp.getVar("db", "user", "string"),
              cfp.getVar("db", "passw", "string"), cfp.getVar("db", "host", "string"),
              cfp.getVar("db", "port", "string"))
    p.prep()
    p.execute("CREATE TABLE IF NOT EXISTS data.failedurls(source text,url text)")
    prioritylist = []
    if cfp.getVar("priorities", "usepriority", "int") == 1:
        prioritylist = cfp.getVar("priorities", "priority_list", "list")
    else:
        prioritylist = [x[0] for x in p.getData("SELECT name FROM crawlers.sor_complete WHERE name NOT ILIKE '%ParserController%' ORDER BY date ASC")]
    # get the python files
    # get the semaphore and lock
    sema = IncSemaphore()
    lock = Lock()
    # the loop
    run = True
    print "Starting Crawlers"
    print "MAX: " + str(maxprocs)
    procnum = 0
    print "Base Directory is " + os.path.split(inspect.stack()[0][1])[0]
    while run is True:
        execs = self.cleanUp(self.getPyFiles(os.path.split(inspect.stack()[0][1])[0]), cfp.getVar("restart", "waitdays", "string"), p)
        execs = list(set(execs))
        execs = [x for x in execs if x not in prioritylist]
        prioritylist.extend(execs)
        execs = prioritylist
        print "Executables: " + str(execs)
        startnew = maxprocs - sema.getValue()
        while len(pool) > 0 or (procnum == 0 and len(execs) > 0):
            print "Processes Left " + str(startnew)
            print "Number of Running Processes " + str(sema.getValue())
            # iterate over a copy so removals do not skip workers
            for proc in pool[:]:
                if not proc.isAlive():
                    pool.remove(proc)
                    del proc
                    procnum -= 1
            execs = self.repopulate(execs, cfp.getVar("restart", "waitdays", "int"), p, maxprocs - sema.getValue(), cfp.getVar("base", "basedir", "string"))
            execs = list(set(self.cleanUp(self.getPyFiles(cfp.getVar("base", "basedir", "string")), cfp.getVar("restart", "waitdays", "string"), p)))
            if len(pool) == 0 and sema.getValue() > 0:
                sema.setValue(0, lock)
            print "Pnum: " + str(procnum)
            if procnum < len(execs) and sema.getValue() < maxprocs:
                if float(psutil.swap_memory()[3]) > 2.0 and float(psutil.cpu_times_percent(interval=1, percpu=False)[4]) > 10.0:
                    print "Resources Low! Waiting for resources to free."
                    print "SMEM: " + str(psutil.swap_memory()[3])
                    print "IOWait: " + str(float(psutil.cpu_times_percent(interval=1, percpu=False)[4]))
                    # use a separate flag so this wait loop does not clobber the controller's run flag
                    waiting = True
                    while waiting is True:
                        avgsmem = 0
                        avgiowait = 0
                        avgmem = 0
                        for i in range(0, 32):
                            time.sleep(.5)
                            avgsmem += psutil.swap_memory()[3]
                            avgmem += psutil.virtual_memory()[2]
                            avgiowait += float(psutil.cpu_times_percent(interval=1, percpu=False)[4])
                        avgsmem /= 32
                        avgmem /= 32
                        avgiowait /= 32
                        print "Avg. Swap Space: " + str(avgsmem)
                        print "Avg. Mem: " + str(avgmem)
                        print "Avg. IO Wait: " + str(avgiowait)
                        if avgiowait < 10 and avgsmem < 2:
                            print "Resource Level Acceptable. Continuing!"
                            waiting = False
                        else:
                            print "Resources Low! Waiting for resources to free."
                try:
                    ex = execs[procnum]
                    print "Starting " + ex
                    w = Worker(self.__fpath, ex.strip(), cfp.getVar("base", "logbase", "string"),
                               cfp.getVar("db", "dbname", "string"), cfp.getVar("db", "user", "string"),
                               cfp.getVar("db", "passw", "string"), cfp.getVar("db", "host", "string"),
                               cfp.getVar("db", "port", "string"), sema, lock)
                    w.start()
                    pool.append(w)
                    time.sleep(5)
                    procnum += 1
                    execs.remove(ex)
                except Exception, e:
                    print "Failed to Start a Crawler"
                    print "Crawler Was: " + ex
                    print str(e)
                    for frame in traceback.extract_tb(sys.exc_info()[2]):
                        fname, lineno, fn, text = frame
                        print "Error in %s on line %d" % (fname, lineno)
            elif sema.getValue() > 0:
                print "Waiting for Crawlers to Complete " + str(sema.getValue())
                print "Waiting To Start ",
                print execs
            time.sleep(30)
            startnew = maxprocs - sema.getValue()