class GetImage(object):
    '''
    The GetImage class contains all methods of the GetImage module.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.__err = Errors()
        self.__appender = False
        self.__imtype = ".jpg"
        self.__images = []

    def quickpixelCompare(self, image, imageSet):
        '''
        Most methods of silhouette detection do not require the entire algorithm
        to be run, just a comparison against the training set of images.
        Expects converted numpy arrays placed in an array.
        '''
        found = False
        i = 0
        while found is False and i < len(imageSet):
            # the images match when every pixel difference is zero
            if not numpy.any(image - imageSet[i]):
                found = True
            i += 1
        return found

    def flatten(self, image):
        '''
        Flatten an image and return its points as a concatenated line.
        '''
        flattened = []
        width, height = image.size
        for i in range(0, width):
            for j in range(0, height):
                flattened.append(image.getpixel((i, j)))
        return flattened

    def extractFace(self, image, cascade):
        '''
        Extract the face using OpenCV Haar cascades.
        Used in the generation of the actual image sets.
        '''
        faceImage = None
        faces = cascade.detectMultiScale(
            cv2.cvtColor(numpy.asarray(image), cv2.COLOR_BGR2GRAY), 1.3, 5)
        # there should only be one
        for (x, y, w, h) in faces:
            faceImage = image.crop((x, y, x + w, y + h))
        return faceImage

    def getGreyScale(self, image):
        '''
        Get the greyscale image.
        '''
        image = image.convert("L")
        return image

    def prep(self, image):
        '''
        Resize and increase contrast for each image in the set, saving in a
        temporary folder as originalname + temp. This is based on what is
        needed to reach a feature-extractable point with OCR.
        '''
        image = image.resize((400, 800), Image.BILINEAR)
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(1.5)
        return image

    def getMeanFace(self, imageSet):
        '''
        Get the mean face, which is used to determine the difference between
        faces. imageSet should be a nested numpy array with each row being an
        image.
        '''
        meanimage = None
        if imageSet is not None:
            m = len(imageSet)
            for i in range(0, m):
                if i == 0:
                    meanimage = imageSet[i]
                else:
                    meanimage = meanimage + imageSet[i]
            if meanimage is not None:
                meanimage = meanimage / m
        return meanimage

    def getCovarSet(self, imageSet, meanImage):
        '''
        Get the covariance (mean-centered) set.
        '''
        covarSet = numpy.zeros((len(imageSet), imageSet[0].size))
        for i in range(0, len(imageSet)):
            covarSet[i] = imageSet[i] - meanImage
        return covarSet

    def getTransposeMatrix(self, imageSet, meanImage):
        '''
        Get the covariance matrix for an image set, which is basically a set
        of average vectors. A square matrix is generated in this fashion from
        which eigenvectors are attainable (e.g. a 4x3 set becomes 4x3 * 3x4).
        '''
        # get the vectors
        if imageSet.size > 0:
            transposeSet = numpy.transpose(imageSet)
            return numpy.dot(imageSet, transposeSet) / len(imageSet)

    def getEigenVect(self, imageSet, vectorSet):
        '''
        Computes the eigenvectors of the covariances (obtains a set of vectors
        that satisfy the zero-covariance equation). Eigenvalues beyond the
        matrix rank are discarded.
        '''
        rank = numpy.linalg.matrix_rank(imageSet)
        evals, evect = numpy.linalg.eig(imageSet)
        evals = evals[0:rank]
        return (evect, evals, rank)

    def getEigenFaces(self, evect, evals, imageSet):
        '''
        Computes the eigenfaces for comparison.
        '''
        efaces = None
        if imageSet.size > 0:
            efaces = numpy.zeros((len(imageSet), imageSet[0].size))
            for i in range(0, len(imageSet)):
                # each eigenface is a combination of the mean-centered images
                # weighted by the components of one eigenvector
                pixrank = numpy.zeros(imageSet[0].size)
                for j in range(0, len(imageSet)):
                    pixrank += evect[j][i] * imageSet[j]
                efaces[i] = pixrank
        return efaces
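    # Illustrative sketch (not part of the original module): a minimal, hedged
    # walk-through of the eigenface math used above, with a tiny hard-coded
    # numpy array standing in for a real training set. The method name
    # exampleEigenPipeline is hypothetical.
    def exampleEigenPipeline(self):
        import numpy
        # three "images" of four pixels each, one per row
        imageSet = numpy.array([[1., 2., 3., 4.],
                                [2., 3., 4., 5.],
                                [4., 5., 6., 7.]])
        meanFace = imageSet.mean(axis=0)                           # cf. getMeanFace
        centered = imageSet - meanFace                             # cf. getCovarSet
        covar = numpy.dot(centered, centered.T) / len(centered)    # cf. getTransposeMatrix
        evals, evects = numpy.linalg.eig(covar)                    # cf. getEigenVect
        return meanFace, covar, evals, evects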
    def buildTrainingSet(self, trainDataDirectory, PCA=False):
        '''
        Store flattened training images from trainDataDirectory in an array
        for comparison. Images are converted to greyscale for a 2-D array.

        *Required Parameters*

        :param trainDataDirectory: directory to train data from

        *Optional Parameters*

        :param PCA: whether or not to perform PCA analysis
        '''
        files = [x for x in os.listdir(trainDataDirectory)
                 if os.path.isfile(trainDataDirectory + x) is True]
        for f in files:
            with open(trainDataDirectory + f, 'rb') as fp:
                if PCA is False:
                    grey = self.greyscale(numpy.asarray(Image.open(fp).convert('L')))
                    self.__images.append(numpy.asarray(self.flatten(Image.fromarray(grey))))
                else:
                    # store images with their PCA equivalents
                    # (reducing the dimensionality of each image)
                    pass

    def predictEigenFaces(self, image):
        '''
        Make a prediction using eigenfaces. Requires that a training set has
        been built. The image mean is taken, weights are obtained, and these
        weights are compared to a weighted image set, with the highest
        comparison being the best match. Euclidean distance is used to break
        ties.

        *Required Parameter*

        :param image: PIL Image to test against (will be greyscaled and flattened)
        '''
        pass

    def performPCA(self, Image):
        '''
        Performs PCA on a PIL Image (no text or numerical data).

        *Required Parameters*

        :param Image: PIL image to use
        '''
        pass

    def predictPCAAlgo(self, Image):
        '''
        Make a prediction with PCA and clustering. Requires that a training
        set has been built. Images are flattened and greyscaled, and mean
        matrices are acquired as in eigenfaces. Dimensionality is then
        reduced, with the reduction used to find the closest images. Ties are
        resolved with Euclidean distances.
        '''
        pass

    def performTessOCR(self, imagePath):
        """
        Performs OCR with Tesseract. Please train Tesseract elsewhere if
        necessary; jTessBoxEditor is a nice program implemented with VietOCR.
        An SVM is used for OCR (e.g. covariance matrices with letter images
        appended as pixels) or transforms.

        *Required Parameters*

        :param imagePath: string path, file pointer, cStringIO buffer, or PIL Image
        """
        # accept a path, an open file, or another file-like buffer; anything
        # else is assumed to already be a PIL Image
        if isinstance(imagePath, str) or hasattr(imagePath, 'read'):
            return image_to_string(Image.open(imagePath), True)
        else:
            return image_to_string(imagePath, True)

    def resize(self, img, xfactor=2, yfactor=2, ipol=cv2.INTER_LINEAR):
        """
        Takes in image bytes or a cv2 image and resizes in cv2 with a
        specified or default interpolation to preserve quality. Returns a
        PIL/pillow image when given bytes.

        *Required Parameters*

        :param img: image as a byte string or cv2/numpy image

        *Optional Parameters*

        :param xfactor: x scaling (2 default)
        :param yfactor: y scaling (2 default)
        :param ipol: cv2 interpolation
        """
        nparr = None
        if str is type(img):
            cstr = bytearray(img)
            nparr = np.asarray(cstr, dtype=np.uint8).copy()
            cvi = cv2.imdecode(nparr, 0)
            cvi = cv2.resize(cvi, None, fx=xfactor, fy=yfactor, interpolation=ipol)
            img = Image.fromarray(cvi)
        else:
            img = cv2.resize(img, (img.shape[1] * xfactor, img.shape[0] * yfactor))
        return img

    def greyscale(self, inarr):
        """
        Greyscale (binarize) an image from a numpy array.
        Returns the morphed numpy array.
        """
        arr = inarr
        arr[arr < 128] = 0
        arr[arr >= 128] = 255
        return arr
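    # Illustrative sketch (not part of the original module): a hedged example
    # of what greyscale() above does to a small numpy array. The method name
    # exampleGreyscale is hypothetical.
    def exampleGreyscale(self):
        import numpy
        arr = numpy.array([[10, 200], [127, 128]], dtype=numpy.uint8)
        out = self.greyscale(arr.copy())
        # out is now [[0, 255], [0, 255]]: values below 128 snap to 0,
        # values at or above 128 snap to 255
        return out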
    def declutter(self, inarr, sdMul=1.5):
        """
        Declutter an image using basic statistics (outliers of dark-run width).
        Returns the morphed numpy array input.

        *Required Parameters*

        :param inarr: numpy array representing an image

        *Optional Parameters*

        :param sdMul: number of standard deviations from the average (applied as +/- sdMul * sd)
        """
        arr = inarr
        height = len(arr)
        width = len(arr[0])
        total = 0
        ws = 0
        avg = 0
        account = False
        wsarr = []
        # get the average run width and the total number of runs
        for i in range(height):
            account = False
            for j in range(width):
                c = arr[i][j]
                if c < 128 and account is False:
                    ws = j
                    account = True
                elif c >= 128 and account is True:
                    account = False
                    total += 1
                    avg += (j - ws)
                    wsarr.append((j - ws))
        avg = float(avg) / total
        sd = 0
        # calculate the standard deviation of the run widths
        for n in wsarr:
            sd += ((n - avg) * (n - avg))
        sd = math.sqrt(sd / (total - 1))
        o = sdMul * sd
        ws = 0
        # perform the declutter: whiten runs whose width is an outlier
        for i in range(height):
            account = False
            for j in range(width):
                c = arr[i][j]
                if c < 128 and account is False:
                    ws = j
                    account = True
                elif c >= 128 and account is True:
                    account = False
                    if (j - ws) > (avg + o) or (j - ws) < (avg - o):
                        for k in range(j - ws):
                            arr[i][ws + k] = 255
        return arr
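    # Illustrative sketch (not part of the original module): the run-length
    # statistics used by declutter() above and sciKitDeclutter() below, shown
    # on a plain Python list. The method name exampleRunLengths is hypothetical.
    def exampleRunLengths(self):
        import math
        runs = [3, 4, 3, 4, 3, 40]              # widths of dark runs found in the rows
        avg = float(sum(runs)) / len(runs)      # 9.5
        sd = math.sqrt(sum((r - avg) ** 2 for r in runs) / (len(runs) - 1))
        # sd is roughly 14.9, so 40 exceeds avg + 1.5 * sd (about 31.9) and a
        # declutter pass with sdMul=1.5 would whiten that run as noise
        return avg, sd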
    def sciKitDeclutter(self, nparr, greater=True, eo=True):
        '''
        Attempt to discover noise using clustering, with means and outliers
        used to decide whether objects belong to a noise or an acceptable
        category. Groups are generated from this distribution and group
        statistics are found. If a group sits either greater or lesser than
        the average number of objects (greater is the normal case) it is
        accepted; otherwise it is rejected.

        *Required Parameters*

        :param nparr: a numpy array of a black and white image

        *Optional Parameters*

        :param greater: whether to choose groups greater than the average number (default True)
        :param eo: whether to use the 3rd standard deviation (default is 1.5)
        '''
        lengths = []
        start = 0
        # generate run lengths from the array
        for j in xrange(nparr.shape[0]):
            found = False
            for i in xrange(nparr.shape[1]):
                # iterate and find dark runs
                if nparr[j][i] < 128 and found is False:
                    found = True
                    start = i
                elif nparr[j][i] >= 128 and found is True:
                    found = False
                    if i - start > 0:
                        lengths.append(i - start)
            if found is True:
                lengths.append(nparr.shape[1] - 1 - start)
        lengths = [x for x in lengths if x > 1]

        # cluster the run lengths with k-means
        n = max(1, int(math.sqrt(len(lengths)) / 2))
        lengths = sorted(lengths)
        kmn = KMeans(n_clusters=n)
        # fit everything since there is nothing else to fit
        cats = kmn.fit_predict([[l] for l in lengths])
        centroids = [[] for x in xrange(n)]
        for i in xrange(len(lengths)):
            centroids[cats[i]].append(lengths[i])
        centroids = [x for x in centroids if len(x) > 1]
        points = copy.deepcopy(centroids)
        for i in xrange(len(centroids)):
            centroids[i] = numpy.mean(centroids[i])
        gc.collect()
        del gc.garbage[:]

        gradients = []
        # find maximum and minimum points with a spline (each maximum represents
        # a grouping of objects); an actual derivative may be better than a
        # window-like function
        # find the gaussian kernel (std, mean) around the groupings
        for i in xrange(len(points)):
            datarr = [numpy.std(points[i]), numpy.mean(points[i])]
            gradients.append(datarr)
        del points
        gc.collect()
        del gc.garbage[:]

        discards = 0
        # discard anything that does not fit into a category
        # regenerate run lengths from the array
        for j in xrange(nparr.shape[0]):
            start = 0
            found = False
            for i in xrange(nparr.shape[1]):
                # iterate and find dark runs
                if nparr[j][i] < 128 and found is False:
                    found = True
                    start = i
                elif nparr[j][i] >= 128 and found is True:
                    found = False
                    if i - start > 0:
                        length = i - start
                        # find the best category
                        mindist = sys.maxint
                        cat = -1
                        for k in xrange(len(centroids)):
                            d = float(abs(length - gradients[k][1]))
                            if float(d) <= float(mindist):
                                mindist = d
                                cat = k
                        if cat > -1:
                            # find if this length falls outside of the extent of the category
                            score = abs(length - gradients[cat][1]) / gradients[cat][0]
                            #score=(1/(math.sqrt(2*math.pi*gradients[cat][0])))*math.exp(-1*math.pow(((length-gradients[cat][1])/(2*gradients[cat][0])),2))
                            maxscore = 1.5
                            if eo is False:
                                maxscore = 3
                            if score > maxscore:
                                # discard if the point sits more than 1.5 or 3 deviations away
                                # (95.45% of points fall within 2 deviations, 99.73% within 3)
                                # reset the points within the object
                                discards += 1
                                for k in xrange(length):
                                    nparr[j][start + k] = 255
            if found is True:
                length = nparr.shape[1] - 1 - start
                # find the best category
                mindist = sys.maxint
                cat = -1
                for k in xrange(len(centroids)):
                    d = float(abs(length - gradients[k][1]))
                    if float(d) <= float(mindist):
                        mindist = d
                        cat = k
                if cat > -1:
                    # find if this length falls outside of the extent of the category
                    score = abs(length - gradients[cat][1]) / gradients[cat][0]
                    maxscore = 1.5
                    if eo is False:
                        maxscore = 3
                    if score > maxscore:
                        # discard if the point sits more than 1.5 or 3 deviations away
                        # reset the points within the object
                        discards += 1
                        for k in xrange(length):
                            nparr[j][start + k] = 255
        print "Discarded " + str(discards) + " Objects"
        gc.collect()
        del gc.garbage[:]
        return nparr

    def getContours(self, img):
        """
        Get contours from cv2.
        """
        # convert to HSV, blur, and clean up with an erode/dilate pass
        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        hsv_img = cv2.medianBlur(hsv_img, 5)
        kernel = np.ones((5, 5), 'uint8')
        hsv_img = cv2.dilate(cv2.erode(hsv_img, kernel, iterations=3), kernel)
        # both bounds are pure white, so only fully white pixels pass the range check
        COLOR_MIN = np.array([255, 255, 255], np.uint8)
        COLOR_MAX = np.array([255, 255, 255], np.uint8)
        frame_threshed = cv2.inRange(hsv_img, COLOR_MAX, COLOR_MIN)
        ret, thresh = cv2.threshold(frame_threshed, 127, 255, 0)
        return cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    @property
    def imtype(self):
        return self.__imtype

    @imtype.setter
    def imtype(self, imtype):
        """
        Set the image type to use ('.jpg', etc.).
        """
        self.__imtype = imtype

    @property
    def appender(self):
        return self.__appender

    @appender.setter
    def appender(self, appender):
        """
        Set whether an image number is appended prior to the type but after the name.
        """
        self.__appender = appender

    def getImageSpynner(self, baseurl, spynner, iser, wait_time, proxy):
        """
        Directly get an image with Spynner.

        *Required Parameters*

        :param baseurl: base url to use with the link (a blank string is nothing)
        :param spynner: spynner instance
        :param iser: selector for the image
        :param wait_time: time to wait in acquiring an image
        :param proxy: string proxy
        """
        br = spynner
        print "Downloading..." + str(iser["src"])
        return br.download(urlnorm.norm(baseurl + iser["src"]), outfd=None,
                           timeout=wait_time, proxy_url=proxy)

    def getImage(self, opener, url, data, wait_time):
        """
        Directly get an image using urllib. Errors must be handled by the caller.

        *Required Parameters*

        :param opener: urllib opener to use (use GetPage for setup)
        :param url: url address to use
        :param data: data to use in the request (as passed to urlencode)
        :param wait_time: time to wait for the request
        """
        return opener.open(urlnorm.norm(url), data, wait_time).read()
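    # Illustrative sketch (not part of the original module): a hedged example
    # of how getImage() above might be called with a urllib2 opener. The URL
    # is a placeholder, and the method name exampleGetImage is hypothetical.
    def exampleGetImage(self):
        import urllib2
        opener = urllib2.build_opener()
        # data=None issues a GET; wait_time is the socket timeout in seconds
        return self.getImage(opener, "http://example.com/image.jpg", None, 30)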
    def downloadURLLib(self, filepath, baseurl, data, html, opener, regex, page, wait_time, proxy):
        """
        Downloads images with urllib using the opener to a directory.

        *Required Parameters*

        :param filepath: directory path to save to
        :param baseurl: base path to use in getting the image (a blank string is nothing)
        :param data: data to use in the request (as in urlencode)
        :param html: html to search for the link
        :param opener: urllib opener
        :param regex: regular expression to use in the search
        :param page: page dictionary
        :param wait_time: time to wait for the request
        :param proxy: proxy to use in the request
        """
        isers = Soup(html).findAll("img", {"src": re.compile(regex)})
        if isers is not None:
            imnum = 0
            for iser in isers:
                print "Attempting to Get Image " + iser["src"] + " for " + page["offenderhash"]
                try:
                    infp = filepath + page["offenderhash"]
                    if self.__appender is True:
                        infp += "_" + str(imnum)
                    infp += self.__imtype
                    with open(infp, 'wb') as fp:
                        fp.write(self.getImage(opener, baseurl + iser["src"], data, wait_time))
                except Exception, e:
                    print "Failed to Get Image " + iser["src"] + " for " + page["offenderhash"]
                    self.__err.error(e, traceback.extract_tb(sys.exc_info()[2]))
                imnum += 1
        else:
            # no image links matched the regular expression
            pass
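# Illustrative sketch (not part of the original module): a hedged example of
# driving the GetImage preprocessing helpers on a synthetic PIL image. The
# function name and variables are hypothetical, and the module-level Image
# and ImageEnhance imports are assumed to be present as elsewhere in this file.
def _example_get_image_usage():
    gi = GetImage()
    img = Image.new("RGB", (50, 100), (200, 200, 200))
    grey = gi.getGreyScale(img)     # convert to 8-bit greyscale ("L" mode)
    prepped = gi.prep(grey)         # resize to 400x800 and boost the contrast
    return prepped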
class QAMeasurer():

    def __init__(self, schema, cutoff, fpath, minCount=2, defaultFail=True):
        '''
        Constructor for the QA Measurer.

        *Required Parameters*

        :param schema: schema to use
        :param cutoff: cutoff to use in determining acceptance
        :param fpath: file path to use with the database connection

        *Optional Parameters*

        :param minCount: the minimum count that the number of entries must reach before calculation proceeds
        :param defaultFail: the default return value for a failure not due to the threshold or regression (e.g. no schemas)
        '''
        self.__schema = schema
        self.__cutoff = cutoff
        self.__minCount = minCount
        self.__defaultFail = defaultFail
        self.__err = Errors()

    def calculateRegression(self, fpath):
        '''
        Calculates regression equations for each column in each table.
        Proceeds to analyze the current statistics first to see if they fail
        a cutoff threshold and then to see if they can be predicted within
        acceptable uncertainty (standard deviation about the regression
        line). Returns True if all columns pass and False if they do not.
        '''
        tableRegressions = {}
        result = False
        p = Psyco(fpath)
        # get tables, check against minCount (will return true if mincount is
        # low but will also send an alert message)
        tables = p.getData("SELECT distinct on(table_name) table_schema,table_name FROM information_schema.tables WHERE table_schema ILIKE '%" + self.__schema + "%'")
        tdict = {}
        schemas = [x[0] for x in tables]
        for table in tables:
            tdict[table[1]] = {}
        # get columns and counts; if anything is extraneous, send an email and return False
        columnCounts = {}
        results = {}
        for k in tdict.keys():
            for schema in schemas:
                try:
                    queryCols = [x[0] for x in p.getData("SELECT distinct(column_name) FROM information_schema.columns WHERE table_schema ILIKE '" + schema + "' AND table_name ILIKE '" + k + "'")]
                    sql = p.getData("SELECT " + ",".join([str("count(" + x + ")") for x in queryCols]) + " FROM " + schema + "." + k)
                    # set the table data points or add to existing data points
                    if len(queryCols) == len(sql[0]):
                        pass
                except Exception, e:
                    print "Failure to Find a Column in a Table or Error in Returned Values"
                    self.__err.error(e, traceback.extract_tb(sys.exc_info()[2]))
            # get the data points from the sql queries (limits calls to the if
            # statement at the cost of a limited amount of memory)
            for table in results.keys():
                for i in range(0, len(results[table]["queryCols"])):
                    if queryCols[i] in tdict[k].keys():
                        tdict[k][queryCols[i]].append(int(result[i]))
        # FOR EACH TABLE (replace existing data points with the equation):
        # calculate m (Sxy/Sxx), intercept (yavg - (m*xavg)), and uncertainty
        # (sqrt((Syy - (pow(m,2)*Sxx)) / (N-2))), then calculate the
        # prediction, which, if it fails, returns False with a full report email
        for k in tdict.keys():
            pass
        gc.collect()
        del gc.garbage[:]
        return result
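# Illustrative sketch (not part of the original module): the regression
# quantities described in the calculateRegression comments (m = Sxy/Sxx,
# intercept = yavg - m*xavg, uncertainty = sqrt((Syy - m**2*Sxx)/(N-2))),
# computed for a toy series of row counts. The function name is hypothetical.
def _example_column_regression(counts):
    import math
    # counts: list of numeric counts ordered by run; x is the run index.
    # Needs at least three points so that N - 2 is positive.
    n = len(counts)
    xs = range(n)
    xavg = float(sum(xs)) / n
    yavg = float(sum(counts)) / n
    sxx = sum((x - xavg) ** 2 for x in xs)
    syy = sum((y - yavg) ** 2 for y in counts)
    sxy = sum((x - xavg) * (y - yavg) for x, y in zip(xs, counts))
    m = sxy / sxx
    b = yavg - m * xavg
    uncertainty = math.sqrt((syy - m ** 2 * sxx) / (n - 2))
    return m, b, uncertainty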