def helperCompareArrays(self, first, second, decimal, msg):
    '''
    There is no obvious way to compare masked arrays nicely.
    This is my attempt at a comparison.
    '''
    self.assertSequenceEqual(first.shape, second.shape,
                             "Correct shape: %s" % msg)
    if ma.count(first) == 0 and ma.count(second) == 0:
        # if they are the same shape and have no data, they are the same
        return
    self.assertTrue(ma.allclose(first, second, atol=decimal),
                    "Values: %s" % msg)
    # assert_array_almost_equal raises AssertionError on mismatch, which
    # fails the test directly (the original wrapped this in try/except and
    # called self.assertRaises(e), which is a misuse of that API)
    np.testing.assert_array_almost_equal(
        ma.getmaskarray(first), ma.getmaskarray(second),
        decimal=decimal, err_msg=msg, verbose=True)
def biweight(x, cst):
    """
    Computes the biweight average and midvariance for a given 1D array.
    Returns a tuple (biweight mean, biweight variance).

    Parameters
    ----------
    x : ndarray
        Input array.
    cst : float
        Parameter controlling how outliers are censored.

    Notes
    -----
    The function is restricted to 1D data only.
    """
    # asserting a (condition, message) tuple is always true; assert the
    # condition directly
    assert x.ndim == 1, "1D array only!"
    xmed = ma.median(x, 0)
    manom = x - xmed
    mad = ma.median(ma.absolute(manom))
    u_i = manom / float(cst * mad)
    u_i *= ma.less_equal(ma.absolute(u_i), 1.).astype(float)
    w_i = 1 - u_i**2
    if ma.count(w_i) > 0:
        biw_m = xmed + ma.sum(manom * w_i**2) / ma.sum(w_i**2)
    else:
        biw_m = xmed
    biw_sd = ma.sqrt(ma.count(x) * ma.sum(manom**2 * w_i**4))
    biw_sd *= 1. / ma.absolute(ma.sum(w_i * (1 - 5 * u_i**2)))
    return (biw_m, biw_sd.item())
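# --- Usage sketch (added; not from the original source). A minimal call of
# --- biweight() above on a small sample; the data values are hypothetical
# --- and `np`/`ma` are the usual numpy / numpy.ma aliases.
import numpy as np
import numpy.ma as ma

sample = ma.asarray([1.0, 1.2, 0.9, 1.1, 1.05])
biw_mean, biw_spread = biweight(sample, cst=6.0)
print(biw_mean, biw_spread)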
def test_remove_clip_box(self):
    """test that we can remove the clip box once set."""
    self.gca.set_clip_box(-90, -75, -180, -165)
    testvec = self.gca.compress(self.grid)
    self.assertTrue(self.gca.is_masked())
    self.assertEqual(ma.count(testvec), 2)
    self.gca.remove_mask()
    self.assertFalse(self.gca.is_masked())
    testvec = self.gca.compress(self.grid)
    self.assertEqual(ma.count(testvec), 4)
def ReturnIs(self, marray, xc, yc, rbins, oversamp, total=0):
    """Returns the average quantities at different radii of a masked array.
    total=1 just returns a total count within rbins."""
    # `self` parameter added: the body references instance attributes
    # (co, si, one_minus_eg_sq), so this must be a method
    SizeY = marray.shape[0]  # Y size
    SizeX = marray.shape[1]  # X size
    x = np.reshape(np.arange(SizeX * SizeY), (SizeY, SizeX)) % SizeX
    x = x.astype(np.float32)
    x /= oversamp
    # integer division so y is the row index under Python 3
    y = np.reshape(np.arange(SizeX * SizeY), (SizeY, SizeX)) // SizeX
    y = y.astype(np.float32)
    y /= oversamp
    rx = (x - xc) * self.co + (y - yc) * self.si
    ry = (xc - x) * self.si + (y - yc) * self.co
    r = np.sqrt(rx**2.0 + ry**2.0 / self.one_minus_eg_sq)
    if total:
        con = (r < rbins)
        TotI = marray[con].sum()
        TotN = ma.count(marray[con]) / (oversamp * oversamp * 1.0)
        return TotI, TotN
    else:
        AvgIAtR = []
        AvgIInR = []
        IInRArr = []
        RArr = []
        NInRArr = []
        letbreak = 0  # this will be used to break the loop if eta is
                      # less than 0.2 for 20 ri's
        for ri in rbins:
            con = (r > ri - 1 / oversamp) & (r < ri + 1 / oversamp)
            IAtR = marray[con].sum()
            NAtR = ma.count(marray[con]) * 1.0
            con = (r < ri)
            IInR = marray[con].sum()
            NInR = ma.count(marray[con]) * 1.0
            if NAtR == 0 or NInR == 0 or ri > 20 and NAtR < 30 or \
               ri > 20 and NInR < 30:
                pass
            else:
                AvgIAtR.append(IAtR / NAtR)
                AvgIInR.append(IInR / NInR)
                IInRArr.append(IInR)
                RArr.append(ri)
                NInRArr.append(NInR)
                if IAtR * NInR / (NAtR * IInR) < 0.2:
                    letbreak += 1
                if letbreak > 20:
                    break
        AvgIAtR = np.asarray(AvgIAtR)
        AvgIInR = np.asarray(AvgIInR)
        IInRArr = np.asarray(IInRArr)
        RArr = np.asarray(RArr)
        NInRArr = np.asarray(NInRArr) / (oversamp * oversamp * 1.0)
        return AvgIAtR, AvgIInR, IInRArr, RArr, NInRArr
def get_dims(squaremask):
    """
    Return the number of unmasked pixels along horizontal and vertical
    profiles through the center of the image (= size of mask).
    """
    dimx, dimy = np.shape(squaremask)
    # integer division so the indices are valid under Python 3
    horvec = squaremask[dimx // 2, :]
    vervec = squaremask[:, dimy // 2]
    return ma.count(horvec), ma.count(vervec)
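# --- Usage sketch (added; not from the original source): a hypothetical
# --- check of get_dims() above on a 6x6 image whose central 4x4 square is
# --- the only unmasked region.
import numpy as np
import numpy.ma as ma

img = ma.masked_all((6, 6))
img[1:5, 1:5] = 1.0
print(get_dims(img))  # expected: (4, 4)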
def _target_recognize_algorithm(self, debug=False):
    if isinstance(self._open_cv, OpenCV):
        if debug:
            self._open_cv.capture_picture(
                '/home/dane/Downloads/20180308_092236.mp4')
        else:
            self._open_cv.capture_picture(1)
        self._open_cv.convert_to_black_and_white()
        if debug:
            # resize
            width, height = self._open_cv.get_image_dimension()
            self._open_cv.resize_image(None, height / 1.2, width / 1.2)
        # make black and white
        #self._open_cv.calculate_level()
        image = self._open_cv.calculate_threshold()
        #self._open_cv.save_image(image=image)
        # contour calc
        self._open_cv.find_contours()
        approx_poly_dp = self._open_cv.get_all_approx_poly_dp()
        centerX = []
        centerY = []
        for approx_poly in approx_poly_dp:
            # renamed from `ma` to avoid shadowing the numpy.ma alias;
            # the result is unused here
            moments = self._open_cv.get_moments(image)
            image = self._open_cv.draw_contour(approx_poly)
            M = cv2.moments(approx_poly)
            cX = int(M["m10"] / M["m00"])
            cY = int(M["m01"] / M["m00"])
            centerX.append(cX)
            centerY.append(cY)
            #print("Center: " + str(M["m10"]) + " " + str(M["m00"]) + " " + str(M))
        meanX = median(centerX)
        meanY = median(centerY)
        finalX = []
        finalY = []
        for x in centerX:
            if meanX * 0.85 < x < meanX * 1.15:
                finalX.append(x)
        for y in centerY:
            if meanY * 0.85 < y < meanY * 1.15:
                finalY.append(y)
        if count(finalX) == count(finalY) >= 4:
            cv2.circle(image, (int(meanX), int(meanY)), 7,
                       (120, 130, 120), -1)
            self._open_cv.save_image(image=image)
            print("TargetRecognized")
            return True
    return False
def test_reset_clip_box(self):
    """test that we can define a different clip box once set"""
    self.gca.set_clip_box(-90, -82, -180, -173)
    testvec = self.gca.compress(self.grid)
    self.assertTrue(self.gca.is_masked())
    testmask = self.gca.get_vec_mask()
    self.assertEqual(np.count_nonzero(testmask), 3)
    self.assertEqual(ma.count(testvec), 1)
    self.gca.set_clip_box(-90, -75, -180, -165)
    testvec = self.gca.compress(self.grid)
    self.assertTrue(self.gca.is_masked())
    self.assertEqual(ma.count(testvec), 2)
    testmask = self.gca.get_vec_mask()
    self.assertEqual(np.count_nonzero(testmask), 2)
    self.assertEqual(ma.count(testvec), 2)
def __init__(self, MetricTable):
    # Create empty ratio table
    nprobs = MetricTable.nprobs
    nsolvs = MetricTable.nsolvs
    self.ratios = ma.masked_array(1.0 * ma.zeros((nprobs + 1, nsolvs)))

    # Compute best relative performance ratios across
    # solvers for each problem
    for prob in range(nprobs):
        metrics = MetricTable.prob_mets(prob)
        best_met = ma.minimum(metrics)
        if (ma.count(metrics) == nsolvs and
                ma.maximum(metrics) <= opts.minlimit):
            self.ratios[prob + 1, :] = 1.0
        else:
            self.ratios[prob + 1, :] = metrics * (1.0 / best_met)

    # Sort each solver's performance ratios
    for solv in range(nsolvs):
        self.ratios[:, solv] = ma.sort(self.ratios[:, solv])

    # Compute largest ratio and use it to replace failure entries
    self.maxrat = ma.maximum(self.ratios)
    self.ratios = ma.filled(self.ratios, 1.01 * self.maxrat)
def _pivot_col(T, tol=1.0E-12, bland=False):
    """
    Given a linear programming simplex tableau, determine the column
    of the variable to enter the basis.

    Parameters
    ----------
    T : 2D ndarray
        The simplex tableau.
    tol : float
        Elements in the objective row larger than -tol will not be
        considered for pivoting.  Nominally this value is zero, but
        numerical issues cause a tolerance about zero to be necessary.
    bland : bool
        If True, use Bland's rule for selection of the column (select the
        first column with a negative coefficient in the objective row,
        regardless of magnitude).

    Returns
    -------
    status: bool
        True if a suitable pivot column was found, otherwise False.
        A return of False indicates that the linear programming simplex
        algorithm is complete.
    col: int
        The index of the column of the pivot element.
        If status is False, col will be returned as nan.
    """
    ma = np.ma.masked_where(T[-1, :-1] >= -tol, T[-1, :-1], copy=False)
    if ma.count() == 0:
        return False, np.nan
    if bland:
        return True, np.where(ma.mask == False)[0][0]
    return True, np.ma.where(ma == ma.min())[0][0]
def get_array_attributes(self):
    lat_end = 45
    lon_end = 180
    ### 5 day data or daily data
    time_end = 730
    # time_end = 3650
    ### Choose array (decadal mean, annual mean or all data)
    ### All Data
    self.what_data = "AllData"
    self.array = load_cflux_masked.load_file(time_end=time_end,
                                             lat_end=lat_end,
                                             lon_end=lon_end)
    ### decadal mean
    # ~ self.what_data = 'DecadalMean'
    # ~ self.array = ma.mean(self.array, axis=0)
    # ~ self.array = np.reshape(self.array, (1, lat_end, lon_end))
    ### annual mean
    self.what_data = "AnnualCycle"
    self.array = ma.mean(np.split(self.array, 10, axis=0), axis=0)
    ###
    self.array_shape = np.shape(self.array)
    print(self.array_shape)
    self.count_non_masked = ma.count(self.array)
    # need to set interpolated and masked array time_end to be equal NB!!!
    self.time_len = self.array_shape[0]
    self.lat_len = self.array_shape[1]
    self.lon_len = self.array_shape[2]
    self.string_length = len(bin(self.count_non_masked)[2:])
    for item in itertools.product(range(self.lat_len), range(self.lon_len)):
        self.actual_data_dict[item] = np.std(self.array[:, item[0], item[1]])
def add_chunk(self, chunk):
    if self.masked:
        ma.sum(chunk, axis=self.axis, out=self.temp)
        self.running_total += self.temp.filled(0)
        self.running_count += ma.count(chunk, axis=self.axis)
    else:
        np.sum(chunk, axis=self.axis, out=self.temp)
        self.running_total += self.temp
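# --- Usage sketch (added; not from the original source): the streaming
# --- pattern above reduced to a standalone snippet (without the
# --- preallocated temp buffer) -- accumulate per-chunk sums and per-cell
# --- unmasked counts, then form the mean at the end.
import numpy as np
import numpy.ma as ma

running_total = np.zeros(3)
running_count = np.zeros(3, dtype=int)
for chunk in (ma.masked_invalid(np.array([[1.0, np.nan, 3.0]])),
              ma.masked_invalid(np.array([[3.0, 2.0, np.nan]]))):
    running_total += ma.sum(chunk, axis=0).filled(0)
    running_count += ma.count(chunk, axis=0)
# cells that never received an unmasked value stay masked in the mean
mean = running_total / ma.masked_equal(running_count, 0)
print(mean)  # expected: [2.0 2.0 3.0]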
def test_ll_corner(self):
    """test that we filter out everything but ll corner"""
    self.gca.set_clip_box(-90, -82, -180, -173)
    testvec = self.gca.compress(self.grid)
    self.assertTrue(self.gca.is_masked())
    testmask = self.gca.get_vec_mask()
    self.assertEqual(np.count_nonzero(testmask), 3)
    self.assertEqual(ma.count(testvec), 1)
def test_xtestCount(self):
    # Test count
    ott = array([0., 1., 2., 3.], mask=[1, 0, 0, 0])
    assert_(count(ott).dtype.type is np.intp)
    assert_equal(3, count(ott))
    assert_equal(1, count(1))
    assert_(eq(0, array(1, mask=[1])))
    ott = ott.reshape((2, 2))
    assert_(count(ott).dtype.type is np.intp)
    assert_(isinstance(count(ott, 0), np.ndarray))
    assert_(count(ott).dtype.type is np.intp)
    assert_(eq(3, count(ott)))
    assert_(getmask(count(ott, 0)) is nomask)
    assert_(eq([1, 2], count(ott, 0)))
def test_testAverage2(self):
    # More tests of average.
    w1 = [0, 1, 1, 1, 1, 0]
    w2 = [[0, 1, 1, 1, 1, 0], [1, 0, 0, 0, 0, 1]]
    x = arange(6)
    assert_(allclose(average(x, axis=0), 2.5))
    assert_(allclose(average(x, axis=0, weights=w1), 2.5))
    y = array([arange(6), 2.0 * arange(6)])
    assert_(allclose(average(y, None),
                     np.add.reduce(np.arange(6)) * 3. / 12.))
    assert_(allclose(average(y, axis=0), np.arange(6) * 3. / 2.))
    assert_(allclose(average(y, axis=1),
                     [average(x, axis=0), average(x, axis=0) * 2.0]))
    assert_(allclose(average(y, None, weights=w2), 20. / 6.))
    assert_(allclose(average(y, axis=0, weights=w2),
                     [0., 1., 2., 3., 4., 10.]))
    assert_(allclose(average(y, axis=1),
                     [average(x, axis=0), average(x, axis=0) * 2.0]))
    m1 = zeros(6)
    m2 = [0, 0, 1, 1, 0, 0]
    m3 = [[0, 0, 1, 1, 0, 0], [0, 1, 1, 1, 1, 0]]
    m4 = ones(6)
    m5 = [0, 1, 1, 1, 1, 1]
    assert_(allclose(average(masked_array(x, m1), axis=0), 2.5))
    assert_(allclose(average(masked_array(x, m2), axis=0), 2.5))
    assert_(average(masked_array(x, m4), axis=0) is masked)
    assert_equal(average(masked_array(x, m5), axis=0), 0.0)
    assert_equal(count(average(masked_array(x, m4), axis=0)), 0)
    z = masked_array(y, m3)
    assert_(allclose(average(z, None), 20. / 6.))
    assert_(allclose(average(z, axis=0), [0., 1., 99., 99., 4.0, 7.5]))
    assert_(allclose(average(z, axis=1), [2.5, 5.0]))
    assert_(allclose(average(z, axis=0, weights=w2),
                     [0., 1., 99., 99., 4.0, 10.0]))
    a = arange(6)
    b = arange(6) * 3
    r1, w1 = average([[a, b], [b, a]], axis=1, returned=True)
    assert_equal(shape(r1), shape(w1))
    assert_equal(r1.shape, w1.shape)
    r2, w2 = average(ones((2, 2, 3)), axis=0, weights=[3, 1], returned=True)
    assert_equal(shape(w2), shape(r2))
    r2, w2 = average(ones((2, 2, 3)), returned=True)
    assert_equal(shape(w2), shape(r2))
    r2, w2 = average(ones((2, 2, 3)), weights=ones((2, 2, 3)), returned=True)
    assert_(shape(w2) == shape(r2))
    a2d = array([[1, 2], [0, 4]], float)
    a2dm = masked_array(a2d, [[0, 0], [1, 0]])
    a2da = average(a2d, axis=0)
    assert_(eq(a2da, [0.5, 3.0]))
    a2dma = average(a2dm, axis=0)
    assert_(eq(a2dma, [1.0, 3.0]))
    a2dma = average(a2dm, axis=None)
    assert_(eq(a2dma, 7. / 3.))
    a2dma = average(a2dm, axis=1)
    assert_(eq(a2dma, [1.5, 4.0]))
def calcScore(self, attempt=-1):
    min_nbb = int(self.MeasSettings['MinValidBB'])
    min_ntb = int(self.MeasSettings['MinValidTB'])
    bcompvpp = bool(self.MeasSettings['CompensateBBVpp'][0])
    bb_raw, bb_average, bb_scores = self.calcBBScore(field='BBVpp',
                                                     attempt=attempt)
    tb_raw, tb_average, tb_scores = self.calcTBScore(attempt=attempt)
    bb_valid = []
    if bcompvpp:
        self.applyTOFCorrection()
        bb_scores = []
        for ii, c in enumerate(self.Candidates):
            field = 'BBVpp_' + c
            cbb_raw, cbb_average, cbb_scores = self.calcBBScore(
                field=field, attempt=attempt)
            bb_scores.append(cbb_scores[ii])
            if ma.count(cbb_average) > min_nbb:
                bb_valid.append(True)
            else:
                bb_valid.append(False)
    if ma.count(bb_average) > min_nbb:
        bbvalid = True
    else:
        bbvalid = False
    if ma.count(tb_average) > min_ntb:
        tbvalid = True
    else:
        tbvalid = False
    if tbvalid:
        tot_scores = (np.asarray(bb_scores) + np.asarray(tb_scores)) / 2.
    else:
        tot_scores = np.asarray(bb_scores)
    # look for the highest score:
    candidate_index = np.argmax(np.asarray(tot_scores))
    if bbvalid and all(bb_valid):
        self.ClassificationResult = self.Candidates[candidate_index]
        self.Scores = tot_scores
        out = 'Plate classified as ' + self.ClassificationResult + '.'
        for ii, c in enumerate(self.Candidates):
            out += ' ' + c + ' score: ' + str(tot_scores[ii].round(2))
    else:
        out = 'Could not classify plate -- invalid BB'
    return out
def myfunction(d, mx, mn):
    # unused imports (absolute, greater) dropped
    from numpy.ma import maximum, minimum, count
    try:
        if count(d) == 0:
            return mx, mn
        mx = float(maximum(mx, float(maximum(d))))
        mn = float(minimum(mn, float(minimum(d))))
    except Exception:
        # d is a nested sequence; recurse over its elements
        for i in d:
            mx, mn = myfunction(i, mx, mn)
    return mx, mn
def myfunction(d, mx, mn):
    from numpy.ma import maximum, minimum, masked_where, absolute, greater, count
    try:
        # mask out fill values beyond 9.9E19 before updating the bounds
        d = masked_where(greater(absolute(d), 9.9E19), d)
        if count(d) == 0:
            return mx, mn
        mx = float(maximum(mx, float(maximum(d))))
        mn = float(minimum(mn, float(minimum(d))))
    except Exception:
        # d is a nested sequence; recurse over its elements
        for i in d:
            mx, mn = myfunction(i, mx, mn)
    return mx, mn
def getStatVal(imageFile, longitude, latitude, winsize, statistic, site):
    """Calculates the statistics on the pixels in the window array."""
    band1, band2, band3, band4, band5, band6, count = \
        'None', 'None', 'None', 'None', 'None', 'None', 'None'
    if imageFile != 'None' and imageFile is not None:
        imageFile = qvf.changestage(imageFile, 'tmp')
        temp = '%s_%s_%spix.tif' % (imageFile.split('.')[0],
                                    site.strip(), winsize)
        if not os.path.exists(temp):
            subsetRaster = getWindow(imageFile, longitude, latitude,
                                     winsize, site)
        else:
            subsetRaster = temp
        try:
            imgInfo = gdalcommon.info(subsetRaster)
            handle = gdal.Open(subsetRaster)
            for band in [1, 2, 3, 4, 5, 6]:
                if handle is not None:
                    bandHandle = handle.GetRasterBand(band)
                    bandArray = bandHandle.ReadAsArray()
                    maskedBand = ma.masked_values(bandArray, 0)
                    count = ma.count(maskedBand)
                    if statistic == 'mean':
                        statVal = maskedBand.mean()
                    elif statistic == 'std':
                        statVal = maskedBand.std()
                    else:
                        statVal = None
                    if band == 1:
                        band1 = statVal
                    elif band == 2:
                        band2 = statVal
                    elif band == 3:
                        band3 = statVal
                    elif band == 4:
                        band4 = statVal
                    elif band == 5:
                        band5 = statVal
                    elif band == 6:
                        band6 = statVal
        except Exception:
            # leave the 'None' placeholders in place if the read fails
            pass
    return band1, band2, band3, band4, band5, band6, count
def calc_bulk_stats(stats_found, num_pts_section):
    if stats_found == 1:
        ice_area = ma.count(elevation2d) * (xy_res**2)
        ridge_area_all = ma.count(elevation2d_ridge_ma) * (xy_res**2)
        mean_ridge_height_all = np.mean(elevation2d_ridge_ma) - level_elev
        mean_ridge_heightL = np.mean(ridge_height_mesh)
        ridge_areaL = ma.count(ridge_height_mesh) * (xy_res**2)
        return [mean_x, mean_y, ice_area, num_ridges, ridge_area_all,
                ridge_areaL, mean_ridge_height_all, mean_ridge_heightL,
                mean_alt, mean_pitch, mean_roll, mean_vel,
                num_pts_section, stats_found]
    elif stats_found == 0:
        #a = ma.masked_all((0))
        #masked_val = mean(a)
        return [mean_x, mean_y, -999, 0, -999, -999, -999, -999,
                mean_alt, mean_pitch, mean_roll, mean_vel,
                num_pts_section, stats_found]
def filter_stripes(self, variable: str) -> None:
    """Filters vertical and horizontal stripe-shaped artifacts from radar data."""
    if variable not in self.data:
        return
    data = self.data[variable][:]
    n_points_in_profiles = ma.count(data, axis=1)
    n_profiles_with_data = np.count_nonzero(n_points_in_profiles)
    if n_profiles_with_data < 300:
        return
    n_vertical = self._filter(data, 1, min_coverage=0.5, z_limit=10,
                              distance=4, n_blocks=100)
    n_horizontal = self._filter(data, 0, min_coverage=0.3, z_limit=-30,
                                distance=3, n_blocks=20)
    logging.info(f'Filtered {n_vertical} vertical and {n_horizontal} '
                 f'horizontal stripes from radar data using {variable}')
def test_testBasic1d(self):
    # Test of basic array creation and properties in 1 dimension.
    (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
    assert_(not isMaskedArray(x))
    assert_(isMaskedArray(xm))
    assert_equal(shape(xm), s)
    assert_equal(xm.shape, s)
    assert_equal(xm.dtype, x.dtype)
    assert_equal(xm.size, reduce(lambda x, y: x * y, s))
    assert_equal(count(xm), len(m1) - reduce(lambda x, y: x + y, m1))
    assert_(eq(xm, xf))
    assert_(eq(filled(xm, 1.e20), xf))
    assert_(eq(x, xm))
def directionality(self):
    # Create series to store data
    self.binDF['self'] = self.binDF['inter'] = self.binDF['up'] = \
        self.binDF['down'] = self.binDF['log2'] = np.nan
    # Loop through rows of the matrix
    for rowNo, row in enumerate(self.probMatrix):
        # Leave the NaN values in place if bin is entirely masked
        if ma.count(row) == 0:
            continue
        # Else calculate values
        else:
            # Extract self frequency
            selflig = row[rowNo]
            # Extract up frequency
            up = row[:rowNo]
            if ma.count(up) == 0:
                up = 0.
            else:
                up = up.sum()
            # Extract down frequency
            down = row[rowNo + 1:]
            if ma.count(down) == 0:
                down = 0.
            else:
                down = down.sum()
            # Calculate inter value
            inter = 1 - selflig - up - down
            # Calculate log2 value
            if up == 0:
                if down == 0:
                    log2 = np.nan
                else:
                    log2 = -np.inf
            elif down == 0:
                log2 = np.inf
            else:
                log2 = np.log2(up / down)
            # Store results
            self.binDF.loc[rowNo, ['self', 'inter', 'up', 'down', 'log2']] = (
                selflig, inter, up, down, log2)
def maskImageStats(mimage):
    n = ma.count(mimage)
    mimagesq = mimage * mimage
    sum1 = ma.sum(mimage)
    sum2 = ma.sum(sum1)
    sumsq1 = ma.sum(mimagesq)
    sumsq2 = ma.sum(sumsq1)
    avg = sum2 / n
    if n > 1:
        stdev = math.sqrt((sumsq2 - sum2 * sum2 / n) / (n - 1))
    else:
        stdev = 2e20
    return n, avg, stdev
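# --- Usage sketch (added; not from the original source): maskImageStats()
# --- above on a tiny image where negative pixels are masked out; assumes
# --- the function and its `math` import are in scope.
import numpy as np
import numpy.ma as ma

img = ma.masked_less(np.array([[1.0, 2.0], [3.0, -1.0]]), 0)
n, avg, stdev = maskImageStats(img)
print(n, avg, stdev)  # expected: 3 valid pixels, mean 2.0, stdev 1.0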
def type(self):
    i = 0
    all_content = json.loads(files.basic_file_read(self.file))
    print("com=", self.comboBox.currentIndex())
    list_all = list(all_content['student'].values())  # convert dict to list
    # search all records
    if self.comboBox.currentIndex() == 0:
        self.tableWidget.setRowCount(count(list(all_content['student'].keys())))
        for key in all_content['student'].keys():
            self.write(i, all_content, key)
            i += 1
        self.hint('')
    # search by student number
    if self.comboBox.currentIndex() == 1:
        # count the matches to size the table
        self.tableWidget.setRowCount(
            sum(item['Sno'] == self.lineEdit.text() for item in list_all))
        for key in all_content['student'].keys():
            if all_content['student'][key]['Sno'] == self.lineEdit.text():
                self.write(i, all_content, key)
                i += 1
        self.hint("student number")
    # search by name
    if self.comboBox.currentIndex() == 2:
        # count the matches to size the table
        self.tableWidget.setRowCount(
            sum(key == self.lineEdit.text() for key in all_content['student'].keys()))
        for key in all_content['student'].keys():
            if key == self.lineEdit.text():
                self.write(i, all_content, key)
                i += 1
        self.hint("name")
    # search by major
    if self.comboBox.currentIndex() == 3:
        # count the matches to size the table
        self.tableWidget.setRowCount(
            sum(item['is'] == self.lineEdit.text() for item in list_all))
        for key in all_content['student'].keys():
            if all_content['student'][key]['is'] == self.lineEdit.text():
                self.write(i, all_content, key)
                i += 1
        self.hint("major")
    # search by course name
    if self.comboBox.currentIndex() == 4:
        # count the matches to size the table
        self.tableWidget.setRowCount(
            sum(item['Course'] == self.lineEdit.text() for item in list_all))
        for key in all_content['student'].keys():
            if all_content['student'][key]['Course'] == self.lineEdit.text():
                self.write(i, all_content, key)
                i += 1
        self.hint("course name")
def __forward__(self, j):  # ,ngb_rating, ngb_weight, m_bias, ngb_m_bias, ngb_n_bias):
    _rating = self._rating[:, j]
    _ngb_rating = _rating[self._neighborhood.flat].reshape((self._m, -1))
    _ngb_m_bias = self._m_bias[self._neighborhood.flat].reshape(
        (self._m, -1))
    _ngb_n_bias = self._n_bias[j]
    _adjust_rating = _ngb_rating - _ngb_m_bias - _ngb_n_bias
    _adjust_factor = np.sqrt(ma.count(_ngb_rating, axis=1))
    _hat_rating = self._m_bias + _ngb_n_bias + ma.sum(
        self._weight * _adjust_rating, axis=1) / _adjust_factor
    return _hat_rating, (_adjust_rating, _adjust_factor, _ngb_rating,
                         _ngb_m_bias, _ngb_n_bias)
def _filter(
    self,
    data: np.ndarray,
    axis: int,
    min_coverage: float,
    z_limit: float,
    distance: float,
    n_blocks: int,
) -> int:
    if axis == 0:
        data = data.T
        echo = self.data["Z"][:].T
    else:
        echo = self.data["Z"][:]
    len_block = int(np.floor(data.shape[0] / n_blocks))
    block_indices = np.arange(len_block)
    n_removed_total = 0
    for block_number in range(n_blocks):
        data_block = data[block_indices, :]
        n_values = ma.count(data_block, axis=1)
        try:
            q1 = np.quantile(n_values, 0.25)
            q3 = np.quantile(n_values, 0.75)
        except IndexError:
            continue
        threshold = distance * (q3 - q1) + q3
        indices = np.where(
            (n_values > threshold) & (n_values > (min_coverage * data.shape[1]))
        )[0]
        true_ind = [int(x) for x in (block_number * len_block + indices)]
        n_removed = len(indices)
        if n_removed > 5:
            continue
        if n_removed > 0:
            n_removed_total += n_removed
            for ind in true_ind:
                ind2 = np.where(echo[ind, :] < z_limit)
                bad_indices = (ind, ind2) if axis == 1 else (ind2, ind)
                self.data["v"][:][bad_indices] = ma.masked
        block_indices += len_block
    return n_removed_total
def test_set_window(self):
    window_data = np.ones((self.lat_size_win, self.lon_size_win))
    x = self.w.set_window(window_data)
    # check output geometry
    self.assertEqual(x.shape[0], 360)
    self.assertEqual(x.shape[1], 720)
    # check output is masked
    self.assertTrue(ma.is_masked(x))
    # check that the window is the only thing in the returned array
    win_masked = ma.count_masked(x)
    win = ma.count(x)
    self.assertEqual(win, window_data.size)
    self.assertEqual(win_masked, x.size - window_data.size)
    self.assertTrue(np.all(x[self.w._window] == window_data))
def corr_proba(r, ndata, ndataset=2, dof=False):
    """Probability of rejecting correlations

    - **r**: Correlation coefficient
    - **ndata**: Number of records used for correlations
    - **ndataset**, optional: Number of datasets (1 for autocorrelations,
      else 2) [default: 2]

    .. todo::

        This must be rewritten using :mod:`scipy.stats`
    """
    # TODO: use scipy for betai and _gammaln?
    from genutil.salstat import betai, _gammaln

    # Basic tests
    ndata = MA.masked_equal(ndata, 0, copy=0)
    r = MV2.masked_where(MA.equal(MA.absolute(r), 1.), r, copy=0)

    # Degrees of freedom
    if dof:
        df = ndata
    else:
        df = ndata - 2 - ndataset

    # Advanced test: prevent extreme values by locally decreasing the dof
    reduc = N.ones(r.shape)
    z = None
    while z is None or MA.count(MA.masked_greater(z, -600.)):
        if z is not None:
            imax = MA.argmin(z.ravel())
            reduc.flat[imax] += 1
        dfr = df / reduc
        t = r * MV2.sqrt(dfr / ((1.0 - r) * (1.0 + r)))
        a = 0.5 * dfr
        b = 0.5
        x = df / (dfr + t**2)
        z = (_gammaln(a + b) - _gammaln(a) - _gammaln(b) +
             a * MA.log(x) + b * MA.log(1.0 - x))

    # Perform the test and format the variable
    prob = MV2.masked_array(betai(a, b, x), axes=r.getAxisList()) * 100
    prob.id = 'corr_proba'
    prob.name = prob.id
    prob.long_name = 'Probability of rejection'
    prob.units = '%'
    return prob
def find_best_baseline(masked, xx):
    """
    Consider polynomial baselines of order 0 through 6 (order 6 seems to
    do a good job for the most complex baselines). Select the baseline
    with the lowest reduced chi-squared, where we have added an extra
    penalty for increasing the degrees of freedom (prior_penalty).
    Without a prior, prior_penalty would be 1.
    """
    prior_penalty = 10.
    chisqs = np.zeros(7)
    ndegs = np.arange(7)
    for i, ndeg in enumerate(ndegs):
        basepoly = fit_baseline(masked, xx, ndeg=ndeg)
        base = basepoly(xx)
        chisqs[i] = np.sum((masked - base)**2) / (ma.count(masked)
                                                  - 1 - prior_penalty * ndeg)
    return np.argmin(chisqs)
def climo(time, series):
    styr = time[0, 0]  # get the first year
    # create a masked array so we can keep track of original entries
    series_orig = ma.masked_equal(series, -999)
    idx = np.asarray(np.where(time[:, 0] == styr)[0])
    av = [0] * len(np.asarray(idx))  # an array to hold the averages
    ms = [0] * len(np.asarray(idx))  # another to hold the valid counts
    yr1 = time[np.asarray(idx), :]  # all time data from the first year
    for elem in idx:
        # all the indexes with the same month as this first-year timestamp
        midx = np.asarray(np.where(time[:, 1] == yr1[elem, 1])[0])
        # of these, all the indexes with the same day
        didx = np.asarray(np.where(time[midx, 2] == yr1[elem, 2])[0])
        # same month, day and hour
        hidx = np.asarray(np.where(time[didx, 3] == yr1[elem, 3])[0])
        # exactly the same time stamp
        x = np.asarray(np.where(time[hidx, 4] == yr1[elem, 4])[0])
        # index of full series where the date/time is the same
        d = midx[didx[hidx[x]]]
        # ma.count gives the number of non-missing values (the original
        # comment said "missing", which is the opposite of what it counts)
        ms[elem] = ma.count(series_orig[d])
        av[elem] = np.mean(series_orig[d])  # average of non-missing values
    return av, ms, yr1
def execute(self, nprocesses=1):
    params = self.params
    model = params["model"]
    kiyopy.utils.mkparents(params['output_root'])
    parse_ini.write_params(params, params['output_root'] + 'params.ini',
                           prefix=prefix)
    # Loop over files to process.
    for file_middle in params['file_middles']:
        input_fname = (params['input_root'] + file_middle +
                       params['input_end'])
        Reader = core.fitsGBT.Reader(input_fname, feedback=self.feedback)
        output_fname = params["output_root"] + file_middle + ".npy"
        if model == "scan_var":
            n_scans = len(Reader.scan_set)
            n_IFs = len(Reader.IF_set)
            first_block = True
            for jj in range(n_IFs):
                # These all become arrays on the first iteration.
                var = 0.0
                mean = 0.0
                counts = 0
                for ii in range(n_scans):
                    Data = Reader.read(ii, jj)
                    if first_block:
                        out_shape = (n_IFs,) + Data.dims[1:]
                        out_arr = sp.empty(out_shape, dtype=float)
                        first_block = False
                    var += ma.sum(Data.data**2, 0).filled(0)
                    mean += ma.sum(Data.data, 0).filled(0)
                    counts += ma.count(Data.data, 0)
                # If we didn't get at least 5 good hits, throw away the
                # scan.
                counts[counts < 5] = -1
                var = var / counts - (mean / counts)**2
                var[counts < 5] = 1.0e10
                out_arr[jj, ...] = var
            sp.save(output_fname, out_arr)
            if self.feedback > 1:
                print("Wrote noise parameters to file: " +
                      utils.abbreviate_file_path(output_fname))
        else:
            raise ValueError("Invalid noise model: " + model)
def test_testBasic2d(self):
    # Test of basic array creation and properties in 2 dimensions.
    for s in [(4, 3), (6, 2)]:
        (x, y, a10, m1, m2, xm, ym, z, zm, xf, s) = self.d
        x.shape = s
        y.shape = s
        xm.shape = s
        ym.shape = s
        xf.shape = s
        assert_(not isMaskedArray(x))
        assert_(isMaskedArray(xm))
        assert_equal(shape(xm), s)
        assert_equal(xm.shape, s)
        assert_equal(xm.size, reduce(lambda x, y: x * y, s))
        assert_equal(count(xm), len(m1) - reduce(lambda x, y: x + y, m1))
        assert_(eq(xm, xf))
        assert_(eq(filled(xm, 1.e20), xf))
        assert_(eq(x, xm))
        self.setup()
def load_gofr(fn):
    # note: the original loaded a global `fn_prot` here, which looks like a
    # bug; use the function argument instead
    data = np.loadtxt(fn)
    print(data.shape)
    x = data[:, 0] * 0.1
    gr = data[:, 1]
    gr = gr / gr[-1]
    c = ma.masked_less_equal(gr, 0)
    count_zeros = ma.count(c)
    # print(count_zeros)
    c = ma.compressed(c)
    y = -0.6 * np.log(c)
    # plt.plot(x, gr, 'r-')
    print(plot_type)
    if plot_type == 'gofr':
        return x, gr
    elif plot_type == 'rdf':
        # print(y)
        # x.shape is a tuple; index it to get the length
        return x[x.shape[0] - count_zeros:], y
def regridToCoarse(fine, fac, mode, missValue):
    nr, nc = np.shape(fine)
    # integer division so the coarse shape stays integral under Python 3
    coarse = np.zeros(nr // fac * (nc // fac)).reshape(nr // fac, nc // fac) + MV
    nr, nc = np.shape(coarse)
    for r in range(0, nr):
        for c in range(0, nc):
            ar = fine[r * fac:fac * (r + 1), c * fac:fac * (c + 1)]
            m = np.ma.masked_values(ar, missValue)
            if ma.count(m) == 0:
                coarse[r, c] = MV
            else:
                if mode == 'average':
                    coarse[r, c] = ma.average(m)
                elif mode == 'median':
                    coarse[r, c] = ma.median(m)
                elif mode == 'sum':
                    coarse[r, c] = ma.sum(m)
                elif mode == 'min':
                    coarse[r, c] = ma.min(m)
                elif mode == 'max':
                    coarse[r, c] = ma.max(m)
    return coarse
def regridToCoarse(fine, fac, mode, missValue=MV, window_extension=0):
    nr, nc = np.shape(fine)
    # integer division so the coarse shape stays integral under Python 3
    coarse = np.zeros(nr // fac * (nc // fac)).reshape(nr // fac, nc // fac) + MV
    nr, nc = np.shape(coarse)
    for r in range(0, nr):
        for c in range(0, nc):
            ar = fine[r * fac:fac * (r + 1), c * fac:fac * (c + 1)]
            if window_extension > 0:
                min_r = max(0, r * fac - window_extension)
                min_c = max(0, c * fac - window_extension)
                max_r = min(fac * (r + 1) + window_extension, np.shape(fine)[0])
                max_c = min(fac * (c + 1) + window_extension, np.shape(fine)[1])
                ar = fine[min_r:max_r, min_c:max_c]
            m = np.ma.masked_values(ar, missValue)
            if ma.count(m) == 0:
                coarse[r, c] = MV
            else:
                if mode == 'average':
                    coarse[r, c] = ma.average(m)
                elif mode == 'median':
                    coarse[r, c] = ma.median(m)
                elif mode == 'sum':
                    coarse[r, c] = ma.sum(m)
                elif mode == 'min':
                    coarse[r, c] = ma.min(m)
                elif mode == 'max':
                    coarse[r, c] = ma.max(m)
                elif mode == 'std':
                    coarse[r, c] = ma.std(m)
    return coarse
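# --- Usage sketch (added; not from the original source): block-averaging a
# --- 4x4 fine grid down to 2x2 with regridToCoarse() above. MV is assumed
# --- to be the module-level missing value; -999.0 is used for illustration.
import numpy as np

MV = -999.0  # hypothetical missing value
fine = np.arange(16, dtype=float).reshape(4, 4)
fine[0, 0] = MV  # one missing cell in the top-left block
coarse = regridToCoarse(fine, 2, 'average', MV)
print(coarse)  # top-left block averages its three valid cells: 10/3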
def get_integrated_intensity(input_spectrum, noise_estimate,
                             downsample_fact=1.):
    """
    Calculate the integrated intensity.

    Calculate the integrated intensity of an input spectrum over the full
    range of that spectrum. Input should be masked to perform the
    calculation over the relevant portion only. An estimate of the error
    is returned as well. This is most accurate if called with a
    pre-calculated noise_estimate, as one generally cannot calculate the
    noise in the spectrum from the signal-only portion of the spectrum.

    downsample_fact = amount by which the spectrum has been downsampled
    (since mom0 here is in units of channels).
    """
    mom0 = ma.sum(input_spectrum)
    num_channels = ma.count(input_spectrum)
    mom0_err = np.sqrt(num_channels) * noise_estimate
    return (mom0 * downsample_fact, mom0_err * downsample_fact)
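# --- Usage sketch (added; not from the original source): integrating a
# --- spectrum with get_integrated_intensity() above, masking the line-free
# --- channels so only the signal window contributes.
import numpy as np
import numpy.ma as ma

spectrum = ma.masked_array([0.1, 2.0, 3.0, 2.5, 0.2],
                           mask=[True, False, False, False, True])
mom0, mom0_err = get_integrated_intensity(spectrum, noise_estimate=0.1)
print(mom0, mom0_err)  # expected: 7.5 and 0.1*sqrt(3) ~ 0.17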
def _collapseStack(self, stack=None, ustack=None, method='SigClip', sig=50.):
    '''
    If called without the stack keyword set, this will collapse the
    entire stack. However, the internal stack is overridden if a
    different stack is passed. For instance, this could be a stack of
    nod pairs.
    '''
    if stack is None:
        stack, ustack = self.stack, self.ustack
    #stack_median = np.median(stack,2)
    #stack_stddev = np.std(stack,2)
    #shape = stack.shape
    #masked_stack = ma.zeros(shape)
    masked_stack = ma.masked_invalid(stack)
    masked_ustack = ma.masked_invalid(ustack)
    image = ma.average(masked_stack, 2, weights=1. / masked_ustack**2)
    uimage = np.sqrt(ma.mean(masked_ustack**2, 2) /
                     ma.count(masked_ustack, 2))
    return image, uimage
def _pivot_row(T, pivcol, phase, tol=1.0E-12):
    """
    Given a linear programming simplex tableau, determine the row for the
    pivot operation.

    Parameters
    ----------
    T : 2D ndarray
        The simplex tableau.
    pivcol : int
        The index of the pivot column.
    phase : int
        The phase of the simplex algorithm (1 or 2).
    tol : float
        Elements in the pivot column smaller than tol will not be
        considered for pivoting.  Nominally this value is zero, but
        numerical issues cause a tolerance about zero to be necessary.

    Returns
    -------
    status: bool
        True if a suitable pivot row was found, otherwise False.
        A return of False indicates that the linear programming problem
        is unbounded.
    row: int
        The index of the row of the pivot element.
        If status is False, row will be returned as nan.
    """
    if phase == 1:
        k = 2
    else:
        k = 1
    ma = np.ma.masked_where(T[:-k, pivcol] <= tol, T[:-k, pivcol], copy=False)
    if ma.count() == 0:
        return False, np.nan
    mb = np.ma.masked_where(T[:-k, pivcol] <= tol, T[:-k, -1], copy=False)
    q = mb / ma
    return True, np.ma.where(q == q.min())[0][0]
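# --- Smoke test (added; not from the original source) for _pivot_col()
# --- further above and _pivot_row() above, on a hypothetical one-constraint
# --- phase-2 tableau in the standard [A | b] layout, objective row last.
import numpy as np

T = np.array([[1.0, 1.0, 1.0, 4.0],     # x0 + x1 + slack = 4
              [-1.0, -1.0, 0.0, 0.0]])  # objective row (negated costs)
found_col, col = _pivot_col(T)
print(found_col, col)  # expected: True 0
found_row, row = _pivot_row(T, col, phase=2)
print(found_row, row)  # expected: True 0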
def main1(i0b, re, eb, n, i0d, rd, ed, point, bpa, dpa, background):
    c.parmnames = ['i0b', 're', 'eb', 'n', 'i0d', 'rd', 'ed', 'point',
                   'bpa', 'dpa', 'background']
    c.xc = c.ycenter - 1.0
    c.yc = c.xcenter - 1.0
    ix = c.xcenter
    iy = c.ycenter
    parms = np.zeros(11)
    parms[0] = i0b
    parms[1] = re
    parms[2] = eb
    parms[3] = n
    parms[4] = i0d
    parms[5] = rd
    parms[6] = ed
    parms[7] = point
    parms[8] = bpa
    parms[9] = dpa
    parms[10] = background
    valueAt(parms)
    z = c.galaxy.flat  # make 1d array from 2d galaxy array
    numra.seed(120980)  # set random number seed
    zerr = np.sqrt(1 + 0.0 * ((abs(numra.poisson(z) - z) * c.gain)**2.0 +
                              c.rdnoise**2.0))
    #zerr = np.sqrt((abs(numra.poisson(z)-z)*c.gain)**2.0 + c.rdnoise**2.0)
    validpix = np.where(zerr != 0.0)
    if c.use_mask:
        numpoints = ma.count(c.maskedgalaxy)
    else:
        numpoints = c.nxpts * c.nypts
    chi2 = np.sum(((z[validpix] - c.model_galaxy.flat[validpix]) /
                   zerr[validpix])**2.0) / numpoints
    print(chi2)
    # the original recomputed this same expression in the return statement
    return chi2
def data_cleaning():
    # establish dataframes for both train and test data
    # data source
    source_train, source_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
    print("Read train and test data")
    print("Training data (show first 5 rows)", "\n", pdtabulate(source_train.head()))
    print("Testing data (show first 5 rows)", "\n", pdtabulate(source_test.head()))
    print("Data cleaning begins")
    print("Check if there are any missing values")
    print("Columns in Training data", "\n", source_train.isnull().any())
    print("Columns in Testing data", "\n", source_test.isnull().any())
    # create dictionaries to store the keys of each string-type column
    train_dictionary_for_feature, test_dictionary_for_feature = {}, {}
    # store all column names
    train_column_name, test_column_name = source_train.columns, source_test.columns
    print("Columns in training data", train_column_name)
    print("Columns in testing data", test_column_name)
    # check difference
    difference_columns_name = list(
        list(set(train_column_name) - set(test_column_name)) +
        list(set(test_column_name) - set(train_column_name)))
    print("Check difference between the two lists of column names", "\n",
          difference_columns_name)
    # store the feature names and the label name
    feature_name, label_name = test_column_name, difference_columns_name
    print("Feature columns:", "\n", feature_name)
    print("Label columns", "\n", label_name)
    # check data types
    print("Checking data type")
    print("The data type of each column in source_train", "\n", source_train.dtypes)
    print("The data type of each column in source_test", "\n", source_test.dtypes)
    # separate strings and integers into different dataframes
    train_int, train_obj = (source_train.select_dtypes(include='int64'),
                            source_train.select_dtypes(include='object'))
    test_int, test_obj = (source_test.select_dtypes(include='int64'),
                          source_test.select_dtypes(include='object'))
    print("Separate different data types")
    print("The integer columns in source_train (show first 5 rows)", "\n",
          pdtabulate(train_int.head()))
    print("The string columns in source_train (show first 5 rows)", "\n",
          pdtabulate(train_obj.head()))
    print("The integer columns in source_test (show first 5 rows)", "\n",
          pdtabulate(test_int.head()))
    print("The string columns in source_test (show first 5 rows)", "\n",
          pdtabulate(test_obj.head()))
    # store the names of the object-type columns
    train_obj_name, test_obj_name = train_obj.columns, test_obj.columns
    # change strings to integers for each text column
    for i in range(0, count(train_obj_name)):
        # extract the unique column values
        train_columns_unique_value = train_obj[train_obj_name[i]].unique()
        test_columns_unique_value = test_obj[test_obj_name[i]].unique()
        # build a train dictionary (key: each unique value, value: running number)
        enum_train = enumerate(train_columns_unique_value)
        train_dict_value = dict((j, i) for i, j in enum_train)
        train_dictionary_for_feature[train_obj_name[i]] = train_dict_value
        # build a test dictionary (key: each unique value, value: running number)
        enum_test = enumerate(test_columns_unique_value)
        test_dict_value = dict((j, i) for i, j in enum_test)
        test_dictionary_for_feature[test_obj_name[i]] = test_dict_value
        # change the text to the corresponding numbers
        train_obj[train_obj_name[i]] = train_obj[train_obj_name[i]].map(train_dict_value)
        test_obj[test_obj_name[i]] = test_obj[test_obj_name[i]].map(train_dict_value)
    # iterate over key/value pairs of the parent dictionary
    print("Nested dictionary for string columns in training data")
    for key, value in train_dictionary_for_feature.items():
        print(key, 'dictionary')
        # again iterate over the nested dictionary
        for sub_key, sub_value in value.items():
            print(sub_key, ':', sub_value)
    print("Nested dictionary for string columns in testing data")
    for key, value in test_dictionary_for_feature.items():
        print(key, 'dictionary')
        # again iterate over the nested dictionary
        for sub_key, sub_value in value.items():
            print(sub_key, ':', sub_value)
    # combine the integer frame and the now-numeric text frame into a pure
    # integer dataframe
    train_pure_int = pd.concat([train_int, train_obj], axis=1,
                               sort=False).reindex(columns=train_column_name)
    print("Convert to integer:")
    print("Training data (show first 5 rows)", "\n", pdtabulate(train_pure_int.head()))
    test_pure_int = pd.concat([test_int, test_obj], axis=1,
                              sort=False).reindex(columns=test_column_name)
    print("Testing data (show first 5 rows)", "\n", pdtabulate(test_pure_int.head()))
    print("Checking if there are any missing values")
    print("The total amount of missing values in train_pure_int", "\n",
          train_pure_int.isnull().sum())
    print("The total amount of missing values in test_pure_int", "\n",
          test_pure_int.isnull().sum())
    # native-country has some blank data: the strings in this column cannot
    # match a key in the temporary dictionary, so the value 99 is added for
    # these missing values
    print("Verify the amount of key values in both training data and test data")
    print("The amount of key values in training data", "\n",
          len(train_dictionary_for_feature["native-country"]))
    print("The amount of key values in testing data", "\n",
          len(test_dictionary_for_feature["native-country"]))
    missing_series = pd.isnull(test_pure_int["native-country"])
    print("The rows with missing values", "\n",
          pdtabulate(test_pure_int[missing_series]))
    print("Input missing values to be 99")
    miss_input = int(99)
    test_pure_int = test_pure_int.fillna(miss_input)
    test_dictionary_for_feature[''] = miss_input
    print("Refreshing...")
    print(pdtabulate(test_pure_int[missing_series]))
    print("Check again...")
    print("The string columns in test_pure_int")
    print(test_pure_int.isnull().sum())
    # establish source data
    x_train = train_pure_int.drop(columns="exceeds50K")
    y_train = train_pure_int.iloc[:, -1:]
    x_test = test_pure_int
    print("Overview of x_train (show first 5 rows):", "\n", pdtabulate(x_train.head()))
    print("Overview of y_train (show first 5 rows):", "\n", pdtabulate(y_train.head()))
    print("Overview of x_test (show first 5 rows):", "\n", pdtabulate(x_test.head()))
    print("Data cleaning is completed.")
    # bar chart
    y_train['exceeds50K'].value_counts().plot(kind='bar')
    print("")
    print(y_train['exceeds50K'].value_counts())
    plt.title("")
    plt.show()
    # histogram
    print("Histogram is going to be generated")
    x1 = source_train['relationship'].to_numpy()
    x2 = source_test['relationship'].to_numpy()
    plt.hist([x1, x2], 10, label=['Train data', 'Test data'])
    plt.legend(loc='upper right')
    plt.title("The relationship")
    plt.show()
    # scatter
    print("Scatter is going to be generated")
    source_train.plot(kind='scatter', x='age', y='capital-gain', color='red')
    source_test.plot(kind='scatter', x='age', y='capital-gain', color='blue')
    plt.show()
    return x_train, y_train, x_test, feature_name, source_test
def test_testOddFeatures(self):
    # Test of other odd features
    x = arange(20)
    x = x.reshape(4, 5)
    x.flat[5] = 12
    assert_(x[1, 0] == 12)
    z = x + 10j * x
    assert_(eq(z.real, x))
    assert_(eq(z.imag, 10 * x))
    assert_(eq((z * conjugate(z)).real, 101 * x * x))
    z.imag[...] = 0.0
    x = arange(10)
    x[3] = masked
    assert_(str(x[3]) == str(masked))
    c = x >= 8
    assert_(count(where(c, masked, masked)) == 0)
    assert_(shape(where(c, masked, masked)) == c.shape)
    z = where(c, x, masked)
    assert_(z.dtype is x.dtype)
    assert_(z[3] is masked)
    assert_(z[4] is masked)
    assert_(z[7] is masked)
    assert_(z[8] is not masked)
    assert_(z[9] is not masked)
    assert_(eq(x, z))
    z = where(c, masked, x)
    assert_(z.dtype is x.dtype)
    assert_(z[3] is masked)
    assert_(z[4] is not masked)
    assert_(z[7] is not masked)
    assert_(z[8] is masked)
    assert_(z[9] is masked)
    z = masked_where(c, x)
    assert_(z.dtype is x.dtype)
    assert_(z[3] is masked)
    assert_(z[4] is not masked)
    assert_(z[7] is not masked)
    assert_(z[8] is masked)
    assert_(z[9] is masked)
    assert_(eq(x, z))
    x = array([1., 2., 3., 4., 5.])
    c = array([1, 1, 1, 0, 0])
    x[2] = masked
    z = where(c, x, -x)
    assert_(eq(z, [1., 2., 0., -4., -5]))
    c[0] = masked
    z = where(c, x, -x)
    assert_(eq(z, [1., 2., 0., -4., -5]))
    assert_(z[0] is masked)
    assert_(z[1] is not masked)
    assert_(z[2] is masked)
    assert_(eq(masked_where(greater(x, 2), x), masked_greater(x, 2)))
    assert_(eq(masked_where(greater_equal(x, 2), x),
               masked_greater_equal(x, 2)))
    assert_(eq(masked_where(less(x, 2), x), masked_less(x, 2)))
    assert_(eq(masked_where(less_equal(x, 2), x), masked_less_equal(x, 2)))
    assert_(eq(masked_where(not_equal(x, 2), x), masked_not_equal(x, 2)))
    assert_(eq(masked_where(equal(x, 2), x), masked_equal(x, 2)))
    assert_(eq(masked_where(not_equal(x, 2), x), masked_not_equal(x, 2)))
    assert_(eq(masked_inside(list(range(5)), 1, 3), [0, 199, 199, 199, 4]))
    assert_(eq(masked_outside(list(range(5)), 1, 3), [199, 1, 2, 3, 199]))
    assert_(eq(masked_inside(array(list(range(5)),
                                   mask=[1, 0, 0, 0, 0]), 1, 3).mask,
               [1, 1, 1, 1, 0]))
    assert_(eq(masked_outside(array(list(range(5)),
                                    mask=[0, 1, 0, 0, 0]), 1, 3).mask,
               [1, 1, 0, 0, 1]))
    assert_(eq(masked_equal(array(list(range(5)),
                                  mask=[1, 0, 0, 0, 0]), 2).mask,
               [1, 0, 1, 0, 0]))
    assert_(eq(masked_not_equal(array([2, 2, 1, 2, 1],
                                      mask=[1, 0, 0, 0, 0]), 2).mask,
               [1, 0, 1, 0, 1]))
    assert_(eq(masked_where([1, 1, 0, 0, 0], [1, 2, 3, 4, 5]),
               [99, 99, 3, 4, 5]))
    atest = ones((10, 10, 10), dtype=np.float32)
    btest = zeros(atest.shape, MaskType)
    ctest = masked_where(btest, atest)
    assert_(eq(atest, ctest))
    z = choose(c, (-x, x))
    assert_(eq(z, [1., 2., 0., -4., -5]))
    assert_(z[0] is masked)
    assert_(z[1] is not masked)
    assert_(z[2] is masked)
    x = arange(6)
    x[5] = masked
    y = arange(6) * 10
    y[2] = masked
    c = array([1, 1, 1, 0, 0, 0], mask=[1, 0, 0, 0, 0, 0])
    cm = c.filled(1)
    z = where(c, x, y)
    zm = where(cm, x, y)
    assert_(eq(z, zm))
    assert_(getmask(zm) is nomask)
    assert_(eq(zm, [0, 1, 2, 30, 40, 50]))
    z = where(c, masked, 1)
    assert_(eq(z, [99, 99, 99, 1, 1, 1]))
    z = where(c, 1, masked)
    assert_(eq(z, [99, 1, 1, 99, 99, 99]))