Example #1
 def getFolder(self,folder,schema,confPath,checkTable="folders.names",setup=True,createImages=False):
     '''
     Get the Folder name from the database. Set up the directory
     as needed. Default is to set up the directory.
     
          
     *Required Parameters*
     :param folder: folder name to use in searching for existing folders (e.g. ILIKE '%${folder}%')
     :param schema: schema to pull from 
     :param confPath: path to configuration file with database information
     
     *Optional Parameters*
     :param setup: whether to set up the directory if no running folder is found
     :param createImages: whether or not to create an Images directory
     :param checkTable: the table to use in checking status and performing operations
     '''
     cfp=Config(confPath)
     p=Psyco(confPath)
     #get any 'running' folder names
     folder=[x[0] for x in p.getData("SELECT folder FROM "+checkTable+" WHERE folder ILIKE '%"+folder+"%' AND status IS false")]
     if folder is None or len(folder) == 0:
         #generate schema name AND update
         folder="FLCrawl"+datetime.datetime.fromtimestamp(time.time()).strftime("%m%d%Y")
         if setup is True:
             self.setup(folder,checkTable,confPath,createImages)
     else:
         folder=folder[0]
         
     return folder
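
A minimal usage sketch of getFolder under stated assumptions: the method is shown as part of a crawler helper class, so the FLSorCrawl class name and the configuration path below are hypothetical placeholders, and setup() is assumed to be the method shown in Example #6.

# Hypothetical usage; FLSorCrawl and the config path are placeholders.
conf = "/path/to/crawler.conf"
crawl = FLSorCrawl()

# Re-use an unfinished 'FLCrawl' folder if one is still marked as running
# (status IS false); otherwise a dated folder is generated and registered.
folder = crawl.getFolder("FLCrawl", "us_fl_crawlsor", conf,
                         checkTable="folders.names", createImages=True)
print "Using folder: " + folder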
Example #2
    def calculateRegression(self, fpath):
        """
        Calculates regression equations for each column in each table.
        Proceeds to analyze the current statistics first to see if they
        fail a cutoff threshold and then to see if they can be predicted
        within acceptable uncertainty (standard deviation about the regression line).
        
        Returns True if all columns pass and False if they do not.
        
        
        :param fpath: The file path to use.
        """
        tableRegressions = {}
        result = False
        p = Psyco(fpath)

        # get tables, check against minCount (will return true if mincount is low but will also send alert message)
        tables = p.getData(
            "SELECT distinct on(table_name) table_schema,table_name FROM information_schema.tables WHERE table_schema ILIKE '%"
            + self.__schema
            + "%'"
        )
        tdict = {}
        schemas = [x[0] for x in tables]
        for table in tables:
            tdict[table[1]] = {}

        # get columns and counts, if anything is extraneous (send email and return False)
        columnCounts = {}
        results = {}
        for k in tdict.keys():
            for schema in schemas:
                try:
                    queryCols = [
                        x[0]
                        for x in p.getData(
                            "SELECT distinct(column_name) FROM information_schema.columns WHERE table_schema ILIKE '"
                            + schema
                            + "' AND table_name ILIKE '"
                            + k
                            + "'"
                        )
                    ]
                    sql = p.getData(
                        "SELECT " + ",".join([str("count(" + x + ")") for x in queryCols]) + " FROM " + schema + "." + k
                    )

                    # set the table data points or add to existing data points
                    if len(queryCols) == len(sql[0]):
                        pass

                except Exception, e:
                    print "Failure to Find a Column in a Table or Error in Returned Values"
                    self.__err.error(e, traceback.extract_tb(sys.exc_info()[2]))
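
The docstring above describes the per-column check this class performs: fit a regression to the historical counts, then ask whether the newest value sits within an acceptable number of standard deviations about the regression line. The sketch below is an illustrative, self-contained version of that check for a single column's count history; the function name, sample data, and 2-sigma threshold are assumptions, not the class's actual values.

# Illustrative only: least-squares line over a column's historical row counts,
# then a check of the newest count against the residual standard deviation.
import math

def regression_check(counts, latest, max_sigmas=2.0):
    # fit y = a + b*x with x = 0..n-1 (ordinary least squares)
    n = float(len(counts))
    xs = range(len(counts))
    mean_x = sum(xs) / n
    mean_y = sum(counts) / n
    sxy = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, counts))
    sxx = sum((x - mean_x) ** 2 for x in xs)
    b = sxy / sxx
    a = mean_y - b * mean_x

    # standard deviation of the residuals about the regression line
    residuals = [y - (a + b * x) for x, y in zip(xs, counts)]
    sigma = math.sqrt(sum(r * r for r in residuals) / n)

    # predict the next point and test whether the new count is acceptable
    predicted = a + b * len(counts)
    return abs(latest - predicted) <= max_sigmas * sigma

print regression_check([100, 110, 125, 130, 142], 155)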
Example #3
    def postToErrorTables(self, sourcename, e, frames):
        """
        Takes in traceback frames and posts them to the database. The
        expectation is that the data collected here is useful in
        anomaly detection.
        
        An error code of -1 specifies that an attribute has no code. This is a property that can be changed
        
        :param sourcename: The name of the source.
        :param frames: The traceback frames from the error.
        :param e: The error, Exception instance.
        """
        code = self.__errCo
        description = None
        etype = None

        etype = str(type(e))

        if "HTTPError" in etype:
            code = e.code
            description = str(e)
        elif hasattr(e, "errno"):
            code = e.errno

        if "URLError" in etype:
            description = str(e.reason)
        else:
            description = str(e)

        if sourcename is None:
            self.printError("Please Specify a Source Name")

        else:
            if self._fpath is not None and self._cfp is not None and etype is not None and description is not None:
                p = Psyco(self._fpath)
                p.execute(
                    "INSERT INTO "
                    + self._cfp.getVar("Error", "table", "string")
                    + " (source,type,code,description) VALUES('"
                    + sourcename
                    + "','"
                    + etype
                    + "','"
                    + str(code)
                    + "','"
                    + description
                    + "')"
                )
            elif etype is None or description is None:
                self.printError("Type or Description Not Found for the Error. Cannot insert To Database.")
            else:
                self.printError("Please Specify a Config Path")
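
The INSERT above is built by string concatenation, so a quote character inside the description would break the statement. Below is a minimal sketch of the same error-record insert done with a parameterized query, assuming direct use of psycopg2 rather than the project's Psyco wrapper; the DSN and the errors.log table name are placeholders.

# Sketch only: parameterized insert of an error record via psycopg2.
# The DSN and err_table default are placeholders, not project values.
import psycopg2

def post_error(sourcename, e, err_table="errors.log"):
    etype = str(type(e))
    code = getattr(e, "code", getattr(e, "errno", -1))
    conn = psycopg2.connect("dbname=crawler user=crawler host=localhost")
    try:
        cur = conn.cursor()
        # values are passed as parameters, so quoting is handled by the driver
        cur.execute("INSERT INTO " + err_table +
                    " (source,type,code,description) VALUES (%s,%s,%s,%s)",
                    (sourcename, etype, str(code), str(e)))
        conn.commit()
    finally:
        conn.close()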
Example #4
 def run(self):
     """
     Executes the crawler as a separate process and monitors it for completion. The worker itself is a Thread, so run is the
     required name of the method.
     """
     print "Executing "+self.__execute
     p=Psyco(self.__fpath)
     cfp=Config(self.__fpath)
     
     if self.__execute is not None and self.__logbase is not None:
         try:
             logfp=self.__logbase+"logs/"+self.__execute.replace(".xml","")+str(int(round(time.time())))+".txt"
             self.__sema.increment(self.__lock)
             try:
                 p.execute("INSERT INTO crawlers.ca_crim_parsers_running(name) VALUES('"+self.__execute+"')")
                 p.execute("DELETE FROM crawlers.ca_crim_parsers_not_run WHERE name LIKE '"+self.__execute+"'")
                 stdfile=open(logfp,'w+')
                 
                 #the products config file will be in the base directory
                 cmd="/usr/bin/java -Xms"+cfp.getVar("Pentaho","xms","string").strip()+" -Xmx"+cfp.getVar("Pentaho","xmx","string").strip()+" -XX:+UseConcMarkSweepGC -Xcompactexplicitgc -Dbeansfile="+self.__logbase+self.__execute+" -jar "+self.__jarfile
                 print cmd
                 pipe=subprocess.Popen(shlex.split(cmd), stdout=stdfile,stderr=subprocess.STDOUT,shell=False)
                 ret=pipe.wait()
                     
                 print "Completed "+self.__execute
                 p.execute("DELETE FROM crawlers.ca_crim_parsers_running WHERE name LIKE '"+self.__execute+"'")
                 if ret == 0:
                     p.execute("INSERT INTO crawlers.ca_crim_parsers_complete (name,crawlstamp) VALUES('"+self.__execute+"','"+str(self.__datestamp)+"')")
                 else:
                     print "PARSERS- Premature Detonation!!! Failure "+self.__execute
                     print str(ret)
                     if cfp.getVar("notification", "err_type","string") == 'fog':
                         self.fogbugzOnFail(logfp)
                     else:
                         self.emailOnFail(logfp)
                     
             finally:
                 print "Decrementing"
                 self.__sema.decrement(self.__lock)
         except Exception, e:
             print "Process Failed"
             print str(e)
             for frame in traceback.extract_tb(sys.exc_info()[2]):
                 fname,lineno,fn,text = frame
                 print "Error in %s on line %d" % (fname, lineno)
         print "Worker Complete "+str(time.time())
Example #5
 def run(self):
     p=Psyco(self.__fpath)
     crawler=GetPage(self.__proxy)
     opener=crawler.getOpener()
     
     html=opener.open("http://www.nationalsecurity.gov.au/Listedterroristorganisations/Pages/default.aspx",None,120).read()
     try:
         spynMap={
         }
         
         confMap={
             "fpath":self.__fpath
         }
         
         cats={
             "targets":"/home/aevans/Documents/cats/terror/targets.txt",
             "activities":"/home/aevans/Documents/cats/terror/activities.txt",
             "attacks":"/home/aevans/Documents/cats/terror/attacks.txt",
             "finance":"/home/aevans/Documents/cats/terror/finance.txt",
             "charges":"/home/aevans/Documents/cats/terror/charges.txt",
             "convictions":"/home/aevans/Documents/cats/terror/convictions.txt",
             "risk":"/home/aevans/Documents/cats/terror/risk.txt",
             "leadership":"/home/aevans/Documents/cats/terror/leadership.txt",
             "background":"/home/aevans/Documents/cats/terror/background.txt",
             "disclaimer":"/home/aevans/Documents/cats/terror/disclaimer.txt",
             "family":"/home/aevans/Documents/cats/terror/family.txt",
             "noninfo":"/home/aevans/Documents/cats/terror/noninfo.txt",
             "recruitment":"/home/aevans/Documents/cats/terror/nrecruitment.txt"
         }
         
         parseMap=[{
            "class":Parser(cats),
            "passSpynner":False
         }]
         
         pages,faillist=crawler.loop(html, linktag="a", linkattr="href", linklimiters={"href":re.compile("www.nationalsecurity.gov.au\/Listedterroristorganisations\/")}, pageId=self.__pid, maxproxies=self.__maxproxies, spynnerMap=spynMap, opener=opener, waitload=120,proxy=self.__proxy, hashName="hash", table="au_parse_test.html", test=False, printTest=False, faillist=[], database=p, cfpMap=confMap, parserClassMap=parseMap, commitSize=100, checkLinks=True)
         p.execute("INSERT INTO au_parse_test.terms VALUES('finished')")
     except Exception,e:
         self.__err.crawlerFail(e,traceback.extract_tb(sys.exc_info()[2]), True)
Example #6
 def setup(self,folder,table,confPath,createImages=False):
     '''
     Setup status table and folder table.
          
     *Required Parameters*
     :param folder: the folder name to use
     :param table: the table to use in setting up the status
     :param confPath: the configuration path
     
     *Optional Parameters*
     :param createImages: whether or not to create an images folder
     '''
     cfp=Config(confPath)
     os.mkdir(cfp.getVar("Images","fpath","string")+"FL/SOR/"+folder)
     
     p=Psyco(confPath)
     
     if createImages is True:
         #update crawl specific tables and create the Images subdirectory
         p.execute("INSERT INTO "+table+" (folder,schema) VALUES ('"+folder+"','us_fl_crawlsor')")
         os.mkdir(cfp.getVar("Images","fpath","string")+"FL/SOR/"+folder+"/Images/")
Example #7
 def changeOnComplete(self,alterSchema,confPath,folder=None):
     '''
     Handles the final cleanup on the last part of the crawl.
     
     *Required Parameters*
     :param alterSchema: the schema to rename and use
     :param confPath: the configuration path to use 
     
     *Optional Parameters*
     :param folder: folder name to use
     
     NOTE: The folder name should be the same as provided from getFolder and/or to setup if used.
     '''
     cfp=Config(confPath)
     p=Psyco(confPath)
     rename="us_fl_crawlsor"+datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
     print("Renaming Schema to "+rename)
     p.execute("ALTER SCHEMA us_fl_crawlsor RENAME TO "+rename)
     
     if folder is not None:
         #update the folder name information
         p.execute("UPDATE folders.names SET status=true,schema='"+rename+"' WHERE folder='"+folder+"'")
     p.execute("INSERT INTO "+cfp.getVar("finished","alterschema","string")+"."+cfp.getVar("finished","altertable","string")+"(filename,schema) VALUES('"+re.search('.*?\/([^\/]+\.py.*)',inspect.getfile(inspect.currentframe())).group(1)+"','"+rename+"')")
Example #8
    def parse(self):
        """
        Run the parser.
        """
        cfp=Config(self.__fpath)
        runlist=[]
        
        files=self.getXmlFiles(cfp.getVar("base", "basedir","string"))
        print "Files Found: ",
        print sorted(files)
        
        #get completed crawlers and parsers
        p=Psyco(self.__fpath)
        data=p.getData("SELECT q1.name,q1.stamp FROM (SELECT * FROM (SELECT name,max(date) as stamp FROM crawlers.ca_crim_weekly_complete GROUP BY name) as q1 UNION (SELECT name,max(date) FROM crawlers.ca_crim_monthly_complete GROUP BY name UNION SELECT name,max(date) FROM crawlers.ca_crim_daily_complete GROUP BY name)) as q1 LEFT OUTER JOIN (SELECT name,crawlstamp FROM crawlers.ca_crim_parsers_complete) as q2 ON q1.stamp = q2.crawlstamp AND regexp_replace(q1.name,'.py','') = regexp_replace(q2.name,'.xml','') WHERE q2.name IS NULL")      
        
        
        nilList=[]
        for xfile in data:
            if re.sub('\.py.*','',xfile[0])+".xml" not in files:
                nilList.append(re.sub('\.py.*','',xfile[0])+".xml") 
                
        if len(nilList) >0:
            print "Files Do Not Exist for the Following (the name but not the extension must match)",
            print sorted(nilList)
        else:
            print "All Crawlers Have Corresponding Parsers"
        
        #get datestamp dict -- assumes that the query gets the max(date)
        dates={}
        for xfile in data:
            fp=re.sub('\.py.*','',xfile[0])+".xml"
                     
            if fp not in dates:
                dates[fp]=xfile[1]

        
        for xfile in data:
            if xfile is not None:
                fp=xfile[0]
                
                if fp.replace('.py','.xml') in files:
                    runlist.append(fp.replace('.py','.xml'))
                   
        print "Execution List: ",
        print sorted(runlist)
        
        nilList=[]
        for xfile in data:
            if re.sub('\.py.*','',xfile[0])+".xml" not in files:
                nilList.append(re.sub('\.py.*','',xfile[0])+".xml") 
        
        if len(nilList) >0:
            print "Parsers that may not have been Found",
            print sorted(nilList)
        else:
            print "All Completed and Found crawlers Accounted For"

        #run the crawlers from the run list
        if len(runlist) > 0:
            pnum=0
            pool=[]
            #get the semaphore and lock
            sema=IncSemaphore()
            lock=Lock()
            
            #max processes
            maxprocs=cfp.getVar("opts","maxdel","int")
            print "Max Processes: "+str(maxprocs)
            #run
            cfp=Config(self.__fpath)
            while len(runlist) >0:
        
                if pnum<maxprocs and len(runlist)>0 and runlist[0] in files:
                    ex=runlist.pop(0)
                    print "Starting "+ex
                    w=Worker(self.__fpath,ex.strip(),cfp.getVar("base","logbase","string"),cfp.getVar("db", "dbname","string"),cfp.getVar("db", "user","string"),cfp.getVar("db", "passw","string"),cfp.getVar("db", "host","string"), cfp.getVar("db", "port","string"),sema,lock,dates[ex],cfp.getVar("jars","parser","string"))
                    w.start()
                    pool.append(w)
                    pnum+=1
                    print "Continuing"
                
                    
                while sema.getValue() >= maxprocs and len(pool) > 0:
                    print "Waiting for Parsers to Complete"
                    time.sleep(random.randint(1,120))
                    
                    for proc in pool[:]:
                        if not proc.isAlive():
                            pool.remove(proc)
                            del proc
    
                            pnum-=1
                        
                            if len(pool) == 0 and sema.getValue() > 0:
                                sema.setValue(0, lock)
                            
                            gc.collect()
                            del gc.garbage[:]
                
                for proc in pool[:]:
                    if not proc.isAlive():
                        pool.remove(proc)
                        del proc

                        pnum-=1
                    
                        if len(pool) == 0 and sema.getValue() > 0:
                            sema.setValue(0, lock)
                        
                        gc.collect()
                        del gc.garbage[:]
                
                if sema.getValue()==0 and len(runlist)==0:
                    break
            time.sleep(30)
            
            print "Completed Loop. Awaiting any final Completions."
            gc.collect()
            del gc.garbage[:]
            
            while sema.getValue()>0 and len(pool)>0:
                print "Waiting for Completion"
                time.sleep(random.randint(1,120))
            
            if len(pool) == 0 and sema.getValue() > 0:
                sema.setValue(0, lock)
            
                
                
            print "Current Execution List Complete. Will Restart Soon!"
                        
if __name__ == '__main__':
    fpath=sys.argv[1]
    #fpath="/home/aevans/Documents/workspace-sts-3.5.1.RELEASE/CACIVIL/Parsers/Conf/test.conf"
    
    p=Psyco(fpath)
    drops=[]
    creates=[]
    creates.append("CREATE TABLE IF NOT EXISTS crawlers.ca_crim_parsers_not_run (name text, date timestamp default current_timestamp)")
    creates.append("CREATE TABLE IF NOT EXISTS crawlers.ca_crim_parsers_running (name text, date timestamp default current_timestamp)")
    creates.append("CREATE TABLE IF NOT EXISTS crawlers.ca_crim_parsers_complete (name text, crawlstamp timestamp, date timestamp default current_timestamp)")
    p.prep(drops, creates)
    
    c=Controller(fpath)
    c.parse()
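
The parse method above throttles its Worker threads with a custom counting semaphore and a pool list, reaping finished threads until the run list is drained. The sketch below shows the same bounded-concurrency pattern using the standard threading.Semaphore instead of the project's IncSemaphore; SleepWorker is a stand-in that only sleeps and is not the Worker class used above.

# Illustrative bounded worker pool mirroring the throttling done in parse().
# SleepWorker is a stand-in for the project's Worker class.
import threading
import time

class SleepWorker(threading.Thread):
    def __init__(self, jobname, sema):
        threading.Thread.__init__(self)
        self.jobname = jobname
        self.sema = sema

    def run(self):
        try:
            time.sleep(1)         # pretend to run a parser
            print "Completed " + self.jobname
        finally:
            self.sema.release()   # always free a slot, even on failure

maxprocs = 3
sema = threading.Semaphore(maxprocs)
runlist = ["a.xml", "b.xml", "c.xml", "d.xml", "e.xml"]
pool = []

while len(runlist) > 0:
    sema.acquire()                # blocks while maxprocs workers are running
    w = SleepWorker(runlist.pop(0), sema)
    w.start()
    pool.append(w)

for w in pool:
    w.join()                      # wait for any stragglers
print "All workers complete"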