Example no. 1
0
    def calculateRegression(self, fpath):
        """
        Calculate regression statistics for each column of each table in
        the configured schema.

        Reads the table/column catalog from information_schema, then pulls
        per-column row counts as the raw data points for the regression.

        NOTE(review): despite the docstring's claim of returning True/False,
        no value is ever returned from the visible body and the accumulators
        below are never populated -- the implementation appears unfinished.
        Confirm against the rest of the class before relying on a return
        value.

        :param fpath: path to the configuration file used by Psyco to open
            the database connection.
        """
        tableRegressions = {}  # reserved: per-table regression results (never filled in here)
        result = False         # reserved: overall pass/fail flag (never returned here)
        p = Psyco(fpath)

        # Fetch every table in the target schema.
        # SECURITY: the query is built by string concatenation; self.__schema
        # must never come from untrusted input (SQL injection risk). Psyco's
        # getData only accepts a raw query string, so this is flagged rather
        # than parameterized.
        tables = p.getData(
            "SELECT distinct on(table_name) table_schema,table_name FROM information_schema.tables WHERE table_schema ILIKE '%"
            + self.__schema
            + "%'"
        )
        tdict = {}
        schemas = [x[0] for x in tables]
        for table in tables:
            tdict[table[1]] = {}

        # Get columns and per-column counts for every (schema, table) pair;
        # failures are logged through the error handler rather than aborting.
        columnCounts = {}
        results = {}
        for k in tdict.keys():
            for schema in schemas:
                try:
                    queryCols = [
                        x[0]
                        for x in p.getData(
                            "SELECT distinct(column_name) FROM information_schema.columns WHERE table_schema ILIKE '"
                            + schema
                            + "' AND table_name ILIKE '"
                            + k
                            + "'"
                        )
                    ]
                    sql = p.getData(
                        "SELECT " + ",".join([str("count(" + x + ")") for x in queryCols]) + " FROM " + schema + "." + k
                    )

                    # Sanity check: one count per discovered column.
                    # FIX: use == for value equality; the original used `is`,
                    # which only worked via CPython's small-int caching.
                    if len(queryCols) == len(sql[0]):
                        pass

                # FIX: `except Exception, e` is Python-2-only syntax; the
                # `as` form below is valid in both Python 2.6+ and Python 3,
                # as is the single-argument print() call.
                except Exception as e:
                    print("Failure to Find a Column in a Table or Error in Returned Values")
                    self.__err.error(e, traceback.extract_tb(sys.exc_info()[2]))
Example no. 2
0
 def getFolder(self,folder,schema,confPath,checkTable="folders.names",setup=True,createImages=False):
     '''
     Get a working folder name from the database, or generate a fresh
     date-stamped one when no non-running folder matches.

     *Required Parameters*
     :param folder: folder name pattern used to search for existing,
         not-yet-running folders (used in an ILIKE comparison)
     :param schema: schema to pull from (NOTE(review): currently unused
         by this method)
     :param confPath: path to configuration file with database information

     *Optional Parameters*
     :param checkTable: the table to query for folder status (FIX: the
         original hardcoded 'folders.names' here, ignoring this parameter;
         the default preserves the old behavior)
     :param setup: whether to run directory setup for a newly generated
         folder name
     :param createImages: whether or not to create an Images directory
         (NOTE(review): currently unused by this method)

     :return: the matched folder name, or a newly generated one
     '''
     p=Psyco(confPath)
     # SECURITY: query built by string concatenation -- `folder` must not
     # come from untrusted input (SQL injection risk).
     # Get any folder matching the pattern whose status is not 'running'.
     matches=[x[0] for x in p.getData("SELECT folder FROM "+checkTable+" WHERE folder ILIKE '"+folder+"' AND status IS false")]
     if not matches:
         # No candidate found: generate a date-stamped name and register it.
         folder="FLCrawl"+datetime.datetime.fromtimestamp(time.time()).strftime("%m%d%Y")
         if setup is True:
             # FIX: the original called setup(folder, "folders.names"),
             # invoking the boolean flag itself -- an unconditional
             # TypeError. The class's setup method is presumably what was
             # intended -- TODO confirm against the rest of the class.
             self.setup(folder,checkTable)
     else:
         folder=matches[0]

     return folder
    def parse(self):
        """
        Run the parser.

        Finds parser definition files (*.xml) on disk, matches them against
        crawlers that completed but have no corresponding completed parser
        (per the crawlers.* completion tables), then launches one Worker per
        parser up to the configured maximum, throttled by a shared
        semaphore and reaped as they finish.
        """
        cfp=Config(self.__fpath)
        runlist=[]
        
        # Discover candidate parser definitions (*.xml) under the base dir.
        files=self.getXmlFiles(cfp.getVar("base", "basedir","string"))
        print "Files Found: ",
        print sorted(files)
        
        #get completed crawlers and parsers
        # The query returns crawlers (name, completion stamp) that have NO
        # matching row in ca_crim_parsers_complete (left join + IS NULL),
        # i.e. crawls whose output still needs parsing.
        p=Psyco(self.__fpath)
        data=p.getData("SELECT q1.name,q1.stamp FROM (SELECT * FROM (SELECT name,max(date) as stamp FROM crawlers.ca_crim_weekly_complete GROUP BY name) as q1 UNION (SELECT name,max(date) FROM crawlers.ca_crim_monthly_complete GROUP BY name UNION SELECT name,max(date) FROM crawlers.ca_crim_daily_complete GROUP BY name)) as q1 LEFT OUTER JOIN (SELECT name,crawlstamp FROM crawlers.ca_crim_parsers_complete) as q2 ON q1.stamp = q2.crawlstamp AND regexp_replace(q1.name,'.py','') = regexp_replace(q2.name,'.xml','') WHERE q2.name IS NULL")
        
        
        # Warn about crawlers whose expected parser file (.py -> .xml) is
        # missing from disk.
        nilList=[]
        for xfile in data:
            if re.sub('\.py.*','',xfile[0])+".xml" not in files:
                nilList.append(re.sub('\.py.*','',xfile[0])+".xml") 
                
        if len(nilList) >0:
            print "Files Do Not Exist for the Following (the name but not the extension must match)",
            print sorted(nilList)
        else:
            print "All Crawlers Have Corresponding Parsers"
        
        #get datestamp dict -- assumes that the query gets the max(date)
        # Map each parser xml filename to its crawler's completion stamp
        # (first occurrence wins).
        dates={}
        for xfile in data:
            fp=re.sub('\.py.*','',xfile[0])+".xml"
                     
            if fp not in dates:
                dates[fp]=xfile[1]

        
        # Build the execution list: every pending crawler whose parser xml
        # actually exists on disk.
        for xfile in data:
            if xfile is not None:
                fp=xfile[0]
                
                if fp.replace('.py','.xml') in files:
                    runlist.append(fp.replace('.py','.xml'))
                   
        print "Execution List: ",
        print sorted(runlist)
        
        # NOTE(review): this block duplicates the missing-file scan above
        # (same loop, same condition) and only differs in the message.
        nilList=[]
        for xfile in data:
            if re.sub('\.py.*','',xfile[0])+".xml" not in files:
                nilList.append(re.sub('\.py.*','',xfile[0])+".xml") 
        
        if len(nilList) >0:
            print "Parsers that may not have been Found",
            print sorted(nilList)
        else:
            print "All Completed and Found crawlers Accounted For"

        #run the crawlers from the run list
        if len(runlist) > 0:
            pnum=0          # count of Workers currently believed running
            pool=[]         # live Worker handles, reaped below
            #get the semaphore and lock
            sema=IncSemaphore()
            lock=Lock()
            
            #max processes
            maxprocs=cfp.getVar("opts","maxdel","int")
            print "Max Processes: "+str(maxprocs)
            #run
            # NOTE(review): cfp is re-created here although it was already
            # built from the same path at the top of the method.
            cfp=Config(self.__fpath)
            while len(runlist) >0:
        
                # Launch the next Worker while below the process cap.
                # NOTE(review): pops from the END of runlist but gates on
                # runlist[0] being in files -- confirm this asymmetry is
                # intended.
                if pnum<maxprocs and len(runlist)>0 and runlist[0] in files:
                    ex=runlist.pop()
                    print "Starting "+ex
                    w=Worker(self.__fpath,ex.strip(),cfp.getVar("base","logbase","string"),cfp.getVar("db", "dbname","string"),cfp.getVar("db", "user","string"),cfp.getVar("db", "passw","string"),cfp.getVar("db", "host","string"), cfp.getVar("db", "port","string"),sema,lock,dates[ex],cfp.getVar("jars","parser","string"))
                    w.start()
                    pool.append(w)
                    pnum+=1
                    print "Continuing"
                
                    
                # Block while the semaphore says we're at/over capacity,
                # reaping any finished Workers each pass.
                while sema.getValue() >= maxprocs and len(pool) > 0:
                    print "Waiting for Parsers to Complete"
                    time.sleep(random.randint(1,120))
                    
                    # NOTE(review): removing from `pool` while iterating it
                    # can skip elements; each reap may therefore take
                    # multiple passes.
                    for proc in pool:
                        if not proc.isAlive():
                            pool.remove(proc)
                            del proc
    
                            pnum-=1
                        
                            # NOTE(review): `is 0` relies on CPython small-int
                            # caching; should be == 0 (also below).
                            if len(pool) is 0 and sema.getValue() >0:
                                sema.setValue(0, lock)
                            
                            gc.collect()
                            del gc.garbage[:]
                
                # Second reap pass (duplicate of the loop above) for Workers
                # that finished while we were not at capacity.
                for proc in pool:
                    if not proc.isAlive():
                        pool.remove(proc)
                        del proc

                        pnum-=1
                    
                        if len(pool) is 0 and sema.getValue() >0:
                            sema.setValue(0, lock)
                        
                        gc.collect()
                        del gc.garbage[:]
                
                if sema.getValue()==0 and len(runlist)==0:
                    break
            time.sleep(30)
            
            print "Completed Loop. Awaiting any final Completions."
            gc.collect()
            del gc.garbage[:]
            
            # Wait out any stragglers still holding the semaphore.
            while sema.getValue()>0 and len(pool)>0:
                print "Waiting for Completion"
                time.sleep(random.randint(1,120))
            
            # NOTE(review): the extra indentation of the next line is legal
            # but misleading; and `is 0` should be == 0 here too.
            if len(pool) is 0 and sema.getValue() >0:
                    sema.setValue(0, lock)
            
                
                
            print "Current Execution List Complete. Will Restart Soon!"