#stdlib and third-party imports used throughout this module (Python 2)
import os
import re
import sys
import gc
import copy
import math
import traceback
import cStringIO
import numpy
import numpy as np
import cv2
from PIL import Image,ImageEnhance
from sklearn.cluster import KMeans
#project-specific helpers (Errors, Psyco, Soup, urlnorm, image_to_string, and the spynner
#browser) are imported in the full module and are not reproduced here
class GetImage(object):
    '''
    The class GetImage contains all methods of the GetImage module.
    '''


    def __init__(self):
        '''
        Constructor
        '''
        self.__err=Errors()
        self.__appender=False
        self.__imtype=".jpg"
        self.__images=[]
    
    def quickpixelCompare(self,image,imageSet):
        '''
        Most methods of silhouette detection won't require the entire algorithm to be run, just a comparison against the training set of images.
        Expects converted numpy arrays placed in an array.
        '''
        found=False
        i=0
        
        while found is False and i<len(imageSet):
            #a pixel-for-pixel match leaves no difference between the arrays
            if numpy.sum(numpy.abs(image-imageSet[i]))==0:
                found=True
            i+=1
        return found
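    #A minimal usage sketch (hypothetical, not part of the original module): trainingImages
    #is a hypothetical list of PIL images, and both the query and training images are
    #flattened into numpy arrays before the comparison.
    #
    #   getter=GetImage()
    #   query=numpy.asarray(getter.flatten(Image.open("query.jpg").convert("L")))
    #   training=[numpy.asarray(getter.flatten(im)) for im in trainingImages]
    #   matched=getter.quickpixelCompare(query,training)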
    
        
    def flatten(self,image):
        '''
        Flatten an image and return its points as a concatenated line
        '''
        flattened=[]
        #PIL reports size as (width,height) and getpixel expects an (x,y) tuple
        width,height=image.size
        for i in range(0,width):
            for j in range(0,height):
                flattened.append(image.getpixel((i,j)))
        return flattened
    
    def extractFace(self,image,cascade):
        '''
        This method gets the Face from OpenCV haar cascades. Used in the generation
        of the actual image sets
        '''
        faceImage=None
        #detectMultiScale needs a greyscale numpy array, so the PIL image is converted first
        faces=cascade.detectMultiScale(cv2.cvtColor(numpy.asarray(image),cv2.COLOR_RGB2GRAY),1.3,5)
        
        #there should only be one
        for (x,y,w,h) in faces:
            faceImage=image.crop((x,y,x+w,y+h))
        return faceImage
    
    def getGreyScale(self,image):
        '''
        Get the greyscale image 
        '''
        image=image.convert("L")
        return image
    
    def prep(self,image):
        '''
        Resize and increase contrast for each image in the set, save in a temporary folder with originalname + temp
        This is based on what I know about getting to a feature extractable point with OCR.
        '''     
        image=image.resize((400,800),Image.BILINEAR)
        enhancer=ImageEnhance.Contrast(image)
        image=enhancer.enhance(1.5)
        
        return image

    
    def getMeanFace(self,imageSet):
        '''
        Get the Mean Face. This is used to determine the difference between faces.
        ImageSet should be a nested numpy array with each row being an image.
        '''
        meanimage=None
        if imageSet is not None:
            m=len(imageSet)
            
            for i in range(0,m):
                if i==0:
                    meanimage=imageSet[i]
                else:
                    meanimage=meanimage+imageSet[i]
            if meanimage is not None:
                meanimage=meanimage/m
        return meanimage
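    #Observation (not original code): when imageSet is a 2-D numpy array with one flattened
    #image per row, the loop above is equivalent to
    #
    #   meanimage=numpy.mean(imageSet,axis=0)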
    
    def getCovarSet(self,imageSet,meanImage):
        '''
        Get the Covariance set. 
        '''
        covarSet=numpy.zeros((len(imageSet),imageSet[0].size))
        for i in range(0,len(imageSet)):
            covarSet[i]=imageSet[i]-meanImage
        return covarSet
    
    def getTransposeMatrix(self,imageSet,meanImage):
        '''
        Get Covariance matrix for an image set which is basically a set of average vectors
        A square matrix should be generated in this fashion by which eigenvectors are attainable.
        (e.g. a 4,3 becomes a 4*3,3*4)
        '''
        #get the vectors and form the small square matrix with a true matrix product
        if imageSet.size>0:
            transposeSet=numpy.transpose(imageSet)
            return numpy.dot(imageSet,transposeSet)/float(len(imageSet))
        return None
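    #Shape check (illustrative): with 4 images of 3 pixels each, imageSet is (4,3), its
    #transpose is (3,4), and numpy.dot(imageSet,transposeSet) is the small (4,4) matrix.
    #For real images the pixel count dwarfs the image count, so this (numImages,numImages)
    #product is far cheaper to eigendecompose than the full pixel-space covariance, which
    #is the usual eigenface trick.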
                
        
    def getEigenVect(self,imageSet,vectorSet):
        '''
        Computes the eigenvector of the covariances (obtains an equation set of vectors that satisfy the equation of 0 covariance)
        Eigenvectors are the result of setting a diagonal to zero and solving then adding.
        '''
        #numpy.linalg.eig returns (eigenvalues,eigenvectors); matrix_rank bounds the usable count
        rank=numpy.linalg.matrix_rank(imageSet)
        evals,evect=numpy.linalg.eig(imageSet)
        evals=evals[0:rank]
        return (evect,evals,rank)
    
    def getEigenFaces(self,evect,eval,imageSet):
        '''
        Computes the eigenfaces for comparison
        '''
        efaces=None
        if imageSet.size>0:
            efaces=numpy.zeros((len(imageSet),imageSet[0].size))
            for i in range(0,len(imageSet)):
                pixrank=0
                #eigenvectors of the small matrix sit in the columns of evect
                for j in range(0,len(imageSet)):
                    pixrank+=evect[j][i]*imageSet[j]
                efaces[i]=pixrank
        
        return efaces
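    #How the eigenface helpers chain together (a sketch; trainingImages is a hypothetical
    #list of PIL images and is not part of the original module):
    #
    #   getter=GetImage()
    #   imageSet=numpy.asarray([numpy.asarray(getter.flatten(im)) for im in trainingImages],dtype=float)
    #   meanFace=getter.getMeanFace(imageSet)
    #   covarSet=getter.getCovarSet(imageSet,meanFace)
    #   smallMatrix=getter.getTransposeMatrix(covarSet,meanFace)
    #   evect,evals,rank=getter.getEigenVect(smallMatrix,None)
    #   eigenFaces=getter.getEigenFaces(evect,evals,covarSet)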
    
    def buildTrainingSet(self,trainDataDirectory,PCA=False):
        '''
        Store flattened training images from trainDataDirectory in an array for comparison.
        
        Images are converted to greyscale for 2-D array
        
        *Required Parameters*
        :param trainDataDirectory: directory to train data from
        
        *Optional Parameters*
        :param PCA: whether or not to perform PCA analysis
        
        '''
        files=[x for x in os.listdir(trainDataDirectory) if os.path.isfile(trainDataDirectory+x)]
        for f in files:
            with open(trainDataDirectory+f,'rb') as fp:
                if PCA is False:
                    #numpy.array gives a writable copy for greyscale()'s in-place threshold
                    self.__images.append(numpy.asarray(self.flatten(Image.fromarray(self.greyscale(numpy.array(Image.open(fp).convert('L')))))))
                else:
                    #store Images with their PCA equivalents (reducing dimensionality of each image)
                    pass
    
    def predictEigenFaces(self,image):
        '''
        Make a prediction using eigenfaces. Requires that a training set have been built. 
        
        The image mean is taken, weights are obtained, and then these weights are compared
        to a weighted image set with highest comparison resulting in the best match.
        
        Euclidean distance is used to break ties.
        
        *Required Parameter*
        :param image: PIL Image to test against (will be greyscaled and flattened)
        '''
        pass
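    #The method below is a hypothetical sketch of the prediction described in the docstring
    #above; it is not part of the original module, and its parameters (a flattened imageVector,
    #eigenFaces with one eigenface per row, a trainingWeights matrix with one weight row per
    #training image, and meanFace) are assumptions rather than stored state.
    def predictEigenFacesSketch(self,imageVector,eigenFaces,trainingWeights,meanFace):
        #project the mean-subtracted image onto the eigenfaces to get its weight vector
        weights=numpy.dot(eigenFaces,imageVector-meanFace)
        #compare against each training image's weights; the smallest Euclidean distance wins
        distances=numpy.sqrt(numpy.sum((trainingWeights-weights)*(trainingWeights-weights),axis=1))
        return int(numpy.argmin(distances))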
    
    def performPCA(self,Image):
        '''
        Performs PCA on a PIL Image (no text or numerical data)
        
        *Required Parameters*
        :param Image: PIL image to use
        '''
        pass
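    #A hypothetical sketch of PCA on a single greyscale PIL image via numpy's SVD; it is not
    #part of the original module and the nComponents parameter is an assumption.
    def performPCASketch(self,image,nComponents=10):
        arr=numpy.asarray(image.convert("L"),dtype=float)
        #centre the columns, then take the leading right-singular vectors as principal axes
        centered=arr-arr.mean(axis=0)
        u,s,vt=numpy.linalg.svd(centered,full_matrices=False)
        #project the rows onto those axes to reduce each row's dimensionality
        return numpy.dot(centered,numpy.transpose(vt[0:nComponents]))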
    
    def predictPCAAlgo(self,Image):
        '''
        Make a prediction with PCA and clustering. Requires  that a training set have been built.
        
        Images are flattened and grey scaled. Mean matrices acquired as in EigenFaces. Dimensionality
        is then reduced with the reduction used to find the closest images. Ties are resolved with Euclidean
        distances. 
        '''
        pass
    
    def performTessOCR(self,imagePath):
        """
        Performs OCR with Tesseract. Please train Tesseract Elsewhere if necessary.
        JTessBox is a nice program implemented with VietOCR. SVM is used for OCR
        (e.g. covariance matrices with letter images appended as pixels) or transforms.
        
        *Required Parameters*
        
        :param imagePath: string path, open file, or cStringIO buffer of an image, or a PIL Image
        """
        #accept a path, an open file, or a cStringIO buffer; anything else is treated as a PIL Image
        if isinstance(imagePath,(str,file)) or isinstance(imagePath,(cStringIO.InputType,cStringIO.OutputType)):
            return image_to_string(Image.open(imagePath),True)
        else:
            return image_to_string(imagePath,True)
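    #Usage sketch (the path is hypothetical; the two-argument image_to_string call mirrors
    #the pytesser-style wrapper this module already relies on):
    #
    #   text=GetImage().performTessOCR("/tmp/scanned_page.png")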
        
    def resize(self,img,xfactor=2,yfactor=2,ipol=cv2.INTER_LINEAR):
        """
        Takes in Image bytes  and resizes in cv2 with a specified or default interpolation to preserve
        quality.
        
        Returns a PIL/pillow image
        
        *Required Parameters*
        
        :param img: image as raw bytes (str) or a cv2/numpy image array
    
        *Optional Parameters*
        
        :param xfactor: xscaling (2 default)
        :param yfactor: yscaling (2 default)
        :param ipol: cv2 interpolation
        """
        
        nparr=None
        if str is type(img):
            #raw bytes are decoded into a cv2 image before resizing
            cstr=bytearray(img)
            nparr=np.asarray(cstr,dtype=np.uint8).copy()
            cvi=cv2.imdecode(nparr,0)
            cvi=cv2.resize(cvi,None, fx=xfactor, fy=yfactor, interpolation = ipol)
            img=Image.fromarray(cvi)
        else:
            #already a cv2/numpy image; honour the requested factors and interpolation and return PIL
            img=Image.fromarray(cv2.resize(img,None,fx=xfactor,fy=yfactor,interpolation=ipol))
                
        return img
     
    def greyscale(self,inarr):
        """
        Threshold a greyscale numpy array to black and white (values below 128 become 0, the rest 255). Returns the morphed numpy array.
        """
        arr=inarr
        arr[arr<128]=0
        arr[arr>=128]=255
        
        return arr
    
    def declutter(self,inarr,sdMul=1.5):
        """
        Declutter an Image using basic statistics (outliers of height and width).
        Returns the morphed numpy array input.
        
        *Required Parameters*
        
        :param inarr: numpy array representing an image
        
        *Optional Parameters*
        
        :param sdMul: number of standard deviations allowed from the average on either side (a window of 2*sdMul)
        """
        arr=inarr
        height=len(arr)
        
        total=0
        ws=0
        avg=0
        account=False
        wsarr=[]
        
        #get the avg and total of dark run lengths (runs of pixels below 128)
        for i in range(height):
            account=False
            for j,c in enumerate(arr[i]):
                if c<128 and account is False:
                    ws=j
                    account=True
                elif c>=128 and account is True:
                    account=False
                    total+=1
                    avg+=(j-ws)
                    wsarr.append((j-ws))
        
        if total==0:
            return arr
        avg/=float(total)
        sd=0
        
        #calculate sd
        for n in wsarr:
            sd+=((n-avg)*(n-avg))
        
        sd=math.sqrt(sd/max(total-1,1))
        o=sdMul*sd
        
        #perform declutter: blank out runs whose length is an outlier
        for i in range(height):
            account=False
            for j,c in enumerate(arr[i]):
                if c<128 and account is False:
                    ws=j
                    account=True
                elif c>=128 and account is True:
                    account=False
                    if (j-ws)>(avg+o) or (j-ws)<(avg-o):
                        for k in range(j-ws):
                            arr[i][ws+k]=255
        return arr
    
    
    def sciKitDeclutter(self,nparr,greater=True,eo=True):
        '''
        Attempt to discover noise using clustering with means and outliers to find if objects belong 
        to a noise or acceptable category. Groups are generated from this distribution and group 
        statistics found. If a group sits either greater or lesser than the average number of objects
        (greater is the normal) it is accepted. Otherwise, it is rejected.
        
        *Required Params*
        :param nparr: a numpy array of a black and white image
        
        *Optional Parameters*
        :param greater: whether to choose groups greater than the average number (default True)
        :param eo: whether to use the 3rd standard deviation (default is slightly above 1.5)
        '''
        lengths=[]
        start=0
        #generate lengths from array
        for j in xrange(nparr.shape[0]):
            found=False
            for i in xrange(nparr.shape[1]):
                #iterate and find
                if nparr[j][i] <128 and found is False:
                        found=True
                        start=i
                elif nparr[j][i]>=128 and found is True:
                    found=False
                    if i-start>0:
                        lengths.append(i-start)
            
            if found is True:
                lengths.append(nparr.shape[1]-1-start)
        
        lengths=[x for x in lengths if x>1]
        if len(lengths)<2:
            return nparr
        #cluster with kmeans; guard against a degenerate cluster count
        n=max(int(math.sqrt(len(lengths))/2),1)
        
        #get the lengths
        lengths=sorted(lengths)
        kmn=KMeans(n_clusters=n)
        cats=kmn.fit_predict([[l] for l in lengths])#fit everything since there is nothing else to fit
        
        centroids=[[] for x in xrange(n)]
        for i in xrange(len(lengths)):
            centroids[cats[i]].append(lengths[i])
        
        centroids=[x for x in centroids if len(x)>1]
        points=copy.deepcopy(centroids)
        for i in xrange(len(centroids)):
            centroids[i]=numpy.mean(centroids[i])
        

        gc.collect()
        del gc.garbage[:]
        
        gradients=[]
        #find maximum and minimum points with spline (each maximum represents a grouping of objects);
        #an actual derivative may be better than a window-like function
        #find the gaussian kernel around the groupings
        for i in xrange(len(points)):
            #guard against a zero standard deviation so the score divisions below cannot fail
            datarr=[max(numpy.std(points[i]),1e-9),numpy.mean(points[i])]
            gradients.append(datarr)
        
        del points
        gc.collect()
        del gc.garbage[:]
        discards=0
        #discard anything that does not fit into a category
        #generate lengths from array
        for j in xrange(nparr.shape[0]):
            start=0
            found=False
            for i in xrange(nparr.shape[1]):
                #iterate and find
                if nparr[j][i] <128 and found is False:
                        found=True
                        start=i
                elif nparr[j][i]>=128 and found is True:
                        found=False
                        if i-start>0:
                            length=i-start
                            #find the best category
                            min=sys.maxint
                            cat=-1
                            for k in xrange(len(centroids)):
                                d=float(abs(length-gradients[k][1]))
                                if float(d)<=float(min):
                                    min=d
                                    cat=k
                            
                            if cat>-1:
                                #find if this length falls outside of the extent of the category
                                score=abs(length-gradients[cat][1])/gradients[cat][0]
                                
                                #score=(1/(math.sqrt(2*math.pi*gradients[cat][0])))*math.exp(-1*math.pow(((length-gradients[cat][1])/(2*gradients[cat][0])),2))
                                maxscore=1.5
                                
                                if eo is False:
                                    maxscore=3
                                
                                if score>maxscore:
                                    #discard if the probability of finding the point is within 1.5-2 deviations or 3 deviations away
                                    #5.55 from 100% or 95.45 percentile means 2 deviations 99.73% is 3
                                    
                                    #reset the points within the object
                                    discards+=1
                                    for k in xrange(length):
                                        nparr[j][start+k]=255
                
            if found is True:
                length=nparr.shape[1]-1-start
                #find the best category
                min=sys.maxint
                cat=-1
                for k in xrange(len(centroids)):
                    d=float(abs(length-gradients[k][1]))
                    if float(d)<=float(min):
                        min=d
                        cat=k
                
                if cat>-1:
                    #find if this length falls outside of the extent of the category
                    score=abs(length-gradients[cat][1])/gradients[cat][0]
                    
                    #score=(1/(math.sqrt(2*math.pi*gradients[cat][0])))*math.exp(-1*math.pow(((length-gradients[cat][1])/(2*gradients[cat][0])),2))
                    maxscore=1.5
                    
                    if eo is False:
                        maxscore=3
                    
                    if score>maxscore:
                        #discard if the probability of finding the point is within 1.5-2 deviations or 3 deviations away
                        #5.55 from 100% or 95.45 percentile means 2 deviations 99.73% is 3
                        
                        #reset the points within the object
                        discards+=1
                        for k in xrange(length):
                            nparr[j][start+k]=255
                            
                
        print "Discarded "+str(discards)+" Objects"
        gc.collect()
        del gc.garbage[:]
        
        return nparr
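    #Usage sketch (the file path is hypothetical): the declutter methods operate on
    #thresholded greyscale arrays such as those produced by greyscale():
    #
    #   getter=GetImage()
    #   arr=getter.greyscale(numpy.array(Image.open("/tmp/scan.png").convert("L")))
    #   cleaned=getter.sciKitDeclutter(arr)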
        
    def getContours(self,img):
        """
        Get Contours from CV2
        """
        #convert to HSV and denoise before thresholding
        hsv_img=cv2.cvtColor(img,cv2.COLOR_BGR2HSV)
        hsv_img=cv2.medianBlur(hsv_img,5)
        kernel=np.ones((5,5),'uint8')
        hsv_img=cv2.dilate(cv2.erode(hsv_img,kernel,iterations=3),kernel)
        
        COLOR_MIN=np.array([255,255,255],np.uint8)
        COLOR_MAX=np.array([255,255,255],np.uint8)
        
        frame_threshed=cv2.inRange(hsv_img,COLOR_MIN,COLOR_MAX)
        ret,thresh=cv2.threshold(frame_threshed,127,255,0)
        return cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
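    #Depending on the installed OpenCV version, cv2.findContours returns (contours,hierarchy)
    #in 2.x/4.x but (image,contours,hierarchy) in 3.x, so callers may need version-aware
    #unpacking, e.g.:
    #
    #   result=GetImage().getContours(img)
    #   contours=result[0] if len(result)==2 else result[1]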
        
    
    @property
    def imtype(self):
        return self.__imtype
        
    @imtype.setter
    def imtype(self,imtype):
        """
        Set the image type to use ('.jpg', etc.)
        """
        self.__imtype=imtype
    
    @property
    def appender(self):
        return self.__appender
        
    @appender.setter
    def appender(self,appender):
        """
        Set an image appender to append prior to the type but after the name.
        """
        self.__appender=appender
    
    def getImageSpynner(self,baseurl,spynner,iser,wait_time,proxy):
        """
        Directly get an Image with Spynner.
        
        *Required Parameters*
        
        :param baseurl: base url to use  with link (a blank string is nothing)
        :param spynner: spynner instance
        :param iser: selector for image
        :param wait_time: time to wait in acquiring an image
        :param proxy: String proxy
        """
        br=spynner
        print "Downloading..."+str(iser["src"])
        return br.download(urlnorm.norm(baseurl+iser["src"]),outfd=None,timeout=wait_time,proxy_url=proxy)
    
    def getImage(self,opener,url,data,wait_time):
        """
        Directly get an Image using URLLib. Errors Must be handled.
        
        *Required Parameters*
        
        :param opener: urllib opener to use (use GetPage for setup)
        :param url: url address to use
        :param data: data to use in request (like that passed to urlencode)
        :param wait_time: time to wait for request
        
        """
        return opener.open(urlnorm.norm(url),data,wait_time).read()
    
    def downloadURLLib(self,filepath,baseurl,data,html,opener,regex,page,wait_time,proxy):
        """
        Downloads with URLLib using the Opener to a directory.
        
        *Required Parameters*
        
        :param filepath: directory path to save to
        :param baseurl: base path to use in getting image (blank string is nothing) 
        :param data: data to use in request (as in urlencode)
        :param html: html to search for link
        :param opener: urllib opener
        :param regex: regular expression to use in search
        :param page: page dictionary
        :param wait_time: time to wait for request
        :param proxy: proxy to use in request
        """
        isers=Soup(html).findAll("img",{"src":re.compile(regex)})
        if isers is not None and len(isers)>0:
            imnum=0
            for iser in isers:
                print "Attempting to Get Image "+iser["src"]+" for "+page["offenderhash"]
                try:
                    infp=filepath+page["offenderhash"]
                    if self.__appender is True:
                        infp+="_"+str(imnum)
                    infp+=self.__imtype
                    with open(infp,'wb') as fp:
                        fp.write(self.getImage(opener,baseurl+iser["src"],data,wait_time))
                except Exception,e:
                    print "Failed to Get Image "+iser["src"]+" for "+page["offenderhash"]
                    self.__err.error(e,traceback.extract_tb(sys.exc_info()[2]))
                imnum+=1
        else:
            print "No Images Found Matching "+regex

class QAMeasurer(object):
    
    def __init__(self,schema,cutoff,fpath,minCount=2,defaultFail=True):
        '''
        Constructor for QA Measurer.
        
        *Required Parameters*
        
        :param schema: schema to use
        :param cutoff: cutoff to use in determining acceptance
        :param fpath: file path to use with the database connection
         
        *Optional Parameters*
        :param minCount: the minimum count that the number of entries must reach before calculation proceeds
        :param defaultFail: the default return value for a failure not due to the threshold or regression (e.g. no schemas)
        '''
        self.__schema=schema
        self.__cutoff=cutoff
        self.__minCount=minCount
        self.__defaultFail=defaultFail
        self.__err=Errors()
    
    def calculateRegression(self,fpath):
        '''
        Calculates regression equations for each column in each table.
        Proceeds to analyze the current statistics first to see if they
        fail a cutoff threshold and then to see if they can be predicted
        within acceptable uncertainty (standard deviation about the regression line).
        
        Returns True if all columns pass and False if they do not.
        '''
        tableRegressions={}
        result=False
        p=Psyco(fpath)
        
        #get tables, check against minCount (will return true if mincount is low but will also send alert message)
        tables=p.getData("SELECT distinct on(table_name) table_schema,table_name FROM information_schema.tables WHERE table_schema ILIKE '%"+self.__schema+"%'")
        tdict={}
        schemas=[x[0] for x in tables]
        for table in tables:
            tdict[table[1]]={}
        
        #get columns and counts, if anything is extraneous (send email and return False)
        columnCounts={}
        results={}
        for k in tdict.keys():
            for schema in schemas:
                try:
                    queryCols=[x[0] for x in p.getData("SELECT distinct(column_name) FROM information_schema.columns WHERE table_schema ILIKE '"+schema+"' AND table_name ILIKE '"+k+"'")]
                    sql=p.getData("SELECT "+",".join([str("count("+x+")") for x in queryCols])+" FROM "+schema+"."+k)          
                    
                    #set the table data points or add to existing data points 
                    if len(queryCols)==len(sql[0]):
                        pass
                    
                except Exception,e:
                    print "Failure to Find a Column in a Table or Error in Returned Values"
                    self.__err.error(e,traceback.extract_tb(sys.exc_info()[2]))
        
        #get the data points from the sql queries (limits calls to if statement at the cost of a limited amount of memory)
        for table in results.keys():
            for i in range(0,len(results[table]["queryCols"])):
                if queryCols[i] in tdict[k].keys():
                    tdict[k][queryCols[i]].append(int(result[i]))
                        
        
        #FOR EACH TABLE (replace existing data points with eq.):
        #calculate m (Sxy/Sxx) and intercept (yavg-(m*xavg)) and uncertainty (sqrt((Syy-(pow(m,2)*Sxx))/(N-2))
        #and calculate prediction which if fails return false with full report email
        for k in tdict.keys():
            pass
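            #A sketch (not original code) of what this loop body might compute for one column,
            #following the formulas in the comment above, with xs as observation indices and
            #ys as the recorded counts:
            #
            #   xavg=sum(xs)/float(len(xs)); yavg=sum(ys)/float(len(ys))
            #   sxx=sum([(x-xavg)*(x-xavg) for x in xs]); syy=sum([(y-yavg)*(y-yavg) for y in ys])
            #   sxy=sum([(xs[i]-xavg)*(ys[i]-yavg) for i in range(len(xs))])
            #   m=sxy/sxx
            #   intercept=yavg-(m*xavg)
            #   uncertainty=math.sqrt((syy-(m*m*sxx))/(len(xs)-2))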
        
        gc.collect()
        del gc.garbage[:]
        
        return result