def test_mmhash3_int():
    assert_equal(murmurhash3_32(3), 847579505)
    assert_equal(murmurhash3_32(3, seed=0), 847579505)
    assert_equal(murmurhash3_32(3, seed=42), -1823081949)

    assert_equal(murmurhash3_32(3, positive=False), 847579505)
    assert_equal(murmurhash3_32(3, seed=0, positive=False), 847579505)
    assert_equal(murmurhash3_32(3, seed=42, positive=False), -1823081949)

    assert_equal(murmurhash3_32(3, positive=True), 847579505)
    assert_equal(murmurhash3_32(3, seed=0, positive=True), 847579505)
    assert_equal(murmurhash3_32(3, seed=42, positive=True), 2471885347)
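# A quick sanity sketch (not part of the original tests) of the relation the
# values above exercise: positive=True reinterprets the signed 32-bit hash as
# unsigned, so only negative values change, by exactly 2**32.
assert -1823081949 + 2 ** 32 == 2471885347
assert murmurhash3_32(3, seed=42, positive=True) == \
    murmurhash3_32(3, seed=42) % (2 ** 32)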
def generate_input_data(source_file):
    """Hash each feature of a preprocessed CSV file into a libsvm-style
    ``hash:1`` representation and write the result to a ``*hashed.txt``
    companion file (click label first for training data, row index for
    test data)."""
    from sklearn.utils.murmurhash import murmurhash3_32
    from csv import DictReader

    # "train_data_preprocessed.csv" -> "train_data_hashed.txt"
    new_file = source_file[:-16] + "hashed.txt"
    with open(new_file, mode='w') as fw, open(source_file, mode='r') as fr:
        for count, row in enumerate(DictReader(fr)):
            if source_file == "train_data_preprocessed.csv":
                # Training rows start with the click label.
                fw.write(str(row["click"]))
                del row["click"]
            else:
                assert source_file == "test_data_preprocessed.csv"
                # Test rows start with the row index instead of a label.
                fw.write(str(count))
            # Hash "column_name + value" so identical values in different
            # columns land in different buckets.
            for k, value in row.items():
                row[k] = murmurhash3_32(str(k) + str(value), positive=True)
            row_sorted = sorted(row.items(), key=lambda x: x[1])
            for k, value in row_sorted:
                fw.write(" " + str(value) + ':1')
            fw.write("\n")
            if count % 1000000 == 0:
                print("{0} rows finished".format(count))
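# Hypothetical one-off invocation (the file name below is the one the
# function itself hard-codes); this produces train_data_hashed.txt next to
# the source CSV.
generate_input_data("train_data_preprocessed.csv")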
def test_mmhash3_int_array():
    rng = np.random.RandomState(42)
    keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
    keys = keys.reshape((3, 2, 1))

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(int(k), seed)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(k, seed, positive=True)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True),
                           expected)
def test_uniform_distribution():
    n_bins, n_samples = 10, 100000
    bins = np.zeros(n_bins, dtype=np.float64)

    for i in range(n_samples):
        bins[murmurhash3_32(i, positive=True) % n_bins] += 1

    means = bins / n_samples
    expected = np.ones(n_bins) / n_bins
    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
def _f(f):
    # partition_no, partitions_count, dest_dir and args are captured from
    # the enclosing scope.
    if partition_no >= 0:
        # Keep only files whose hash falls in this worker's partition.
        file_partition = murmurhash3_32(f, args.seed) % partitions_count
        if file_partition != partition_no:
            return False
    file_name = f.rsplit(".", 1)[0]
    file_path = dest_dir + 'large' + '/v1_' + file_name + '.jpg'
    if args.no_skip:
        return True
    # Skip files whose output already exists.
    if os.path.exists(file_path) and os.path.isfile(file_path):
        return False
    return True
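# A minimal sketch (not from the original code) of the partition scheme _f
# relies on: hashing a file name with a shared seed deterministically assigns
# it to exactly one of N workers. The names file_names, seed and
# partitions_count are illustrative.
from sklearn.utils.murmurhash import murmurhash3_32

file_names = ['a.png', 'b.png', 'c.png', 'd.png']
partitions_count, seed = 2, 0
for worker in range(partitions_count):
    mine = [f for f in file_names
            if murmurhash3_32(f, seed) % partitions_count == worker]
    print(worker, mine)  # every file lands in exactly one partition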
def single_text_hash(x: str) -> str:
    """Get text hash.

    Args:
        x: Text.

    Returns:
        String text hash.

    """
    numhash = murmurhash3_32(x, seed=13)
    texthash = str(numhash) if numhash > 0 else 'm' + str(abs(numhash))
    return texthash
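# Usage sketch (not from the original code): equal inputs always map to the
# same string, and negative raw hashes get an 'm' prefix instead of a minus
# sign, so the result is always a clean identifier-like token.
assert single_text_hash('foo') == single_text_hash('foo')
assert not single_text_hash('foo').startswith('-')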
def _make_category(df: DataFrame, cols: Sequence[str]) -> np.ndarray:
    """Make hash for category interactions.

    Args:
        df: Input DataFrame.
        cols: List of columns.

    Returns:
        Hash np.ndarray.

    """
    res = np.empty((df.shape[0],), dtype=np.int32)
    for n, inter in enumerate(zip(*(df[x] for x in cols))):
        h = murmurhash3_32("_".join(map(str, inter)), seed=42)
        res[n] = h
    return res
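# Illustrative call of _make_category on a toy frame (the column names are
# made up): each row's values are joined with '_' and hashed, so rows with
# identical (a, b) pairs receive identical int32 codes.
import pandas as pd

df = pd.DataFrame({'a': ['x', 'y', 'x'], 'b': [1, 2, 1]})
codes = _make_category(df, ['a', 'b'])
assert codes[0] == codes[2]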
def make_color(num: int) -> Tuple[int]:
    """Create a random color based on number.

    The provided number is passed through the murmur hash function in
    order to generate bytes which are somewhat apart from each other.
    The three least significant byte values are taken as r, g, and b.

    Parameters
    ----------
    num: int
        Number to use as hash key.

    Returns
    -------
    bytes[3]
        (r, g, b) values
    """
    val = murmurhash3_32(num, positive=True).to_bytes(8, 'little')
    # color = qg.QColor(val[0], val[1], val[2])
    return val[:3]
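# Quick demonstration (not from the original code): the hash is deterministic,
# so the same id always yields the same (r, g, b) triple across runs.
assert make_color(7) == make_color(7)
r, g, b = make_color(7)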
def play_tracks(vidfile, trackfile, lw=2, color='auto', fontscale=1,
                fthickness=1, fstart=0, fend=-1, trail=0, trail_sec=False,
                torigfile=None, tmtfile=None, vout=None, outfmt='MJPG',
                fps=None, vwidth=None, vheight=None, timestamp=False,
                dt=True, skipempty=False):
    """Play the video from `vidfile` and overlay the bounding boxes and IDs
    of the tracked animals from `trackfile`.

    Parameters
    ----------
    vidfile: str
        Path of video file.
    trackfile: str
        Path of track data file.
    lw: int
        Line width for drawing.
    color: str
        If ``auto``, use a random color for each animal. Anything but
        ``auto`` will use red.
    fontscale: float
        Scaling for text font.
    fthickness: int
        Font thickness in OpenCV text display.
    fstart: int
        Frame # to start with.
    fend: int
        Last frame # to process (-1 means end of video).
    trail: int
        Number of frames or seconds of trail to show before the current
        frame. Thus with `trail`=100, the positions in the past 100 frames
        will be drawn like a tail behind each animal.
    trail_sec: bool
        Whether `trail` is a number of frames or a number of seconds.
    torigfile: str
        Path to original timestamp file.
    tmtfile: str
        Path to motion-tracked timestamp file.
    vout: str
        Output video file path.
    outfmt: str
        Output video format.
    fps: float
        fps of output video.
    vwidth: int
        Output video width.
    vheight: int
        Output video height.
    timestamp: bool
        If `True` then show timestamp.
    dt: bool
        If `True`, and `timestamp` is also `True`, then show time elapsed
        from start instead of timestamp.
    skipempty: bool
        If `True` skip frames without any track.
    """
    cap = cv2.VideoCapture(vidfile)
    if not cap.isOpened():
        print('Could not open file', vidfile)
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    if trackfile.endswith('.csv'):
        tracks = pd.read_csv(trackfile)
    else:
        tracks = pd.read_hdf(trackfile, 'tracked')
    timestamps = None
    if torigfile is not None:
        torig = pd.read_csv(torigfile)
        timestamps = torig
    if tmtfile is not None:
        tmt = pd.read_csv(tmtfile)
        timestamps = pd.merge(torig, tmt, left_on='outframe',
                              right_on='inframe')
        timestamps.drop(['outframe_x', 'timestamp_y', 'inframe_x',
                         'inframe_y'], axis=1, inplace=True)
        timestamps.rename({'inframe_x': 'origframe', 'inframe_y': 'inframe',
                           'timestamp_x': 'timestamp',
                           'outframe_y': 'frame'},
                          axis=1, inplace=True)
    infps = cap.get(cv2.CAP_PROP_FPS)  # input frame rate
    if timestamps is None:
        # No timestamp files: synthesize timestamps from the video file's
        # modification time and its frame rate.
        tstart = datetime.fromtimestamp(time.mktime(time.localtime(
            os.path.getmtime(vidfile))))
        # Seconds offset of each frame from the start (a local name
        # distinct from the `dt` parameter).
        delta = np.arange(frame_count) / infps
        ts = tstart + pd.to_timedelta(delta, unit='s')
        timestamps = pd.DataFrame({'frame': np.arange(frame_count),
                                   'timestamp': ts})
    else:
        timestamps['timestamp'] = pd.to_datetime(timestamps['timestamp'])
        tstart = timestamps['timestamp'].min()
    win = os.path.basename(vidfile)
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
    colors = {}
    for ii in set(tracks.trackid.values):
        if color == 'auto':
            # Hash the track id to get a stable pseudo-random color.
            val = murmurhash3_32(int(ii),
                                 positive=True).to_bytes(8, 'little')
            colors[ii] = (val[0], val[1], val[2])
        else:
            colors[ii] = (0, 0, 255)
    out = None
    width = None
    height = None
    scale_x = 1
    scale_y = 1
    if vout is not None:
        fourcc = cv2.VideoWriter_fourcc(*outfmt)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if vwidth is not None or vheight is not None:
            w, h = resize_dim(width, height, vwidth, vheight)
            scale_x = w / float(width)
            scale_y = h / float(height)
            width, height = (w, h)
        if fps is None:
            fps = infps
        out = cv2.VideoWriter(vout, fourcc, fps, (width, height))
        print(f'Saving video with tracks in {vout}. Video format {outfmt}')
    frame_no = -1
    if fstart > 0:
        frame_no = fstart - 1
        cap.set(cv2.CAP_PROP_POS_FRAMES, fstart)
    if fend < 0:
        fend = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    while frame_no < fend:
        ret, frame = cap.read()
        if frame is None:
            print('End at frame', frame_no)
            break
        frame_no += 1
        trackdata = tracks[tracks.frame == frame_no]
        if (len(trackdata) == 0) and skipempty:
            continue
        if vout is not None and (vwidth is not None or vheight is not None):
            frame = cv2.resize(frame, (width, height), cv2.INTER_AREA)
        if timestamp:
            cv2.putText(frame, str(int(frame_no)), (100, 100),
                        cv2.FONT_HERSHEY_COMPLEX, fontscale,
                        (255, 255, 0), fthickness, cv2.LINE_AA)
            ts = timestamps[timestamps['frame'] == frame_no][
                'timestamp'].iloc[0]
            if dt:
                ts = ts - timestamps['timestamp'].min()
            cv2.putText(frame, str(ts), (frame.shape[1] - 200, 100),
                        cv2.FONT_HERSHEY_COMPLEX, fontscale,
                        (255, 255, 0), fthickness, cv2.LINE_AA)
        # Get the trail of the track (history)
        hist = None
        if trail > 0:
            if trail_sec:
                ts = timestamps[timestamps['frame'] == frame_no][
                    'timestamp'].iloc[0]
                tdelta = ts - timestamps['timestamp']
                ds = tdelta.dt.total_seconds()
                tgood = timestamps[(0 < ds) & (ds < trail)]
                hist = pd.merge(tracks, tgood, how='inner', on='frame')
            else:
                hist = tracks[(tracks.frame < frame_no)
                              & (tracks.frame >= frame_no - trail)]
        for row in trackdata.itertuples():
            id_ = int(row.trackid)
            if hist is not None:
                # Draw the past positions as dots behind the animal.
                cur_hist = hist[hist.trackid == id_]
                hx = cur_hist.x.values + cur_hist.w.values / 2.0
                hy = cur_hist.y.values + cur_hist.h.values / 2.0
                for _hx, _hy in zip(hx, hy):
                    cv2.circle(frame, (int(_hx), int(_hy)), 1,
                               colors[id_], -1)
            cv2.rectangle(frame,
                          (int(row.x * scale_x), int(row.y * scale_y)),
                          (int((row.x + row.w) * scale_x),
                           int((row.y + row.h) * scale_y)),
                          colors[id_], lw)
            cv2.putText(frame, str(id_),
                        (int(row.x * scale_x), int(row.y * scale_y)),
                        cv2.FONT_HERSHEY_COMPLEX, fontscale,
                        colors[id_], fthickness, cv2.LINE_AA)
        cv2.imshow(win, frame)
        if out is not None:
            out.write(frame)
        key = cv2.waitKey(100)
        if key == ord('q') or key == 27:
            break
    if out is not None:
        out.release()
    cap.release()
def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert_true(h not in previous_hashes,
                    "Found collision on growing empty string")
        previous_hashes.add(h)
def test_mmhash3_unicode():
    assert_equal(murmurhash3_32(u('foo'), 0), -156908512)
    assert_equal(murmurhash3_32(u('foo'), 42), -1322301282)

    assert_equal(murmurhash3_32(u('foo'), 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(u('foo'), 42, positive=True), 2972666014)
def test_mmhash3_bytes():
    assert_equal(murmurhash3_32('foo', 0), -156908512)
    assert_equal(murmurhash3_32('foo', 42), -1322301282)

    assert_equal(murmurhash3_32('foo', 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32('foo', 42, positive=True), 2972666014)
def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert h not in previous_hashes, \
            "Found collision on growing empty string"
        previous_hashes.add(h)
def test_mmhash3_unicode():
    assert murmurhash3_32('foo', 0) == -156908512
    assert murmurhash3_32('foo', 42) == -1322301282

    assert murmurhash3_32('foo', 0, positive=True) == 4138058784
    assert murmurhash3_32('foo', 42, positive=True) == 2972666014
def plot_tracks(trackfile, ms=5, lw=5, show_bbox=True, bbox_alpha=(0.0, 1.0),
                plot_alpha=1.0, quiver=True, qcmap='hot', qwidth=-1,
                vidfile=None, frame=-1, fstart=0, fend=-1, gray=False,
                randcolor=True, axes=False):
    if trackfile.endswith('.csv'):
        tracks = pd.read_csv(trackfile)
    else:
        tracks = pd.read_hdf(trackfile, 'tracked')
    tracks.describe()
    img = None
    fig, ax = plt.subplots()
    if vidfile is not None:
        cap = cv2.VideoCapture(vidfile)
        if frame < 0:
            frame = cap.get(cv2.CAP_PROP_FRAME_COUNT) - 1
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame))
        ret, img = cap.read()
        if img is None:
            print('Could not read image')
        elif img.shape[-1] == 3:  # BGR
            if gray:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        elif len(img.shape) == 2:
            gray = True
    if img is not None:
        if gray:
            ax.imshow(img, origin='upper', cmap='gray')
        else:
            ax.imshow(img, origin='upper')
    if not axes:
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        for s in ['left', 'bottom', 'top', 'right']:
            ax.spines[s].set_visible(False)
    print('Unique tracks:', len(tracks.trackid.unique()))
    if fend < 0:
        fend = tracks.frame.max()
    tracks = tracks[(tracks.frame >= fstart) & (tracks.frame <= fend)].copy()
    for trackid, trackgrp in tracks.groupby('trackid'):
        pos = trackgrp.sort_values(by='frame')
        cx = pos.x + pos.w / 2.0
        # The Y axis is inverted when using an image.
        # Keep it consistent when no image is used.
        if img is None:
            cy = -(pos.y + pos.h / 2.0)
        else:
            cy = pos.y + pos.h / 2.0
        # Hash the track id to get a stable pseudo-random color.
        val = murmurhash3_32(int(trackid),
                             positive=True).to_bytes(8, 'little')
        color = (val[0] / 255.0, val[1] / 255.0, val[2] / 255.0)
        if show_bbox:
            alpha = np.linspace(bbox_alpha[0], bbox_alpha[1], len(pos))
            for ii, p in enumerate(pos.itertuples()):
                bbox = plt.Rectangle((p.x, p.y), p.w, p.h, linewidth=lw,
                                     edgecolor=color, facecolor='none',
                                     alpha=alpha[ii])
                ax.add_patch(bbox)
        if quiver:
            u = np.diff(cx)
            v = np.diff(cy)
            c = np.linspace(0, 1, len(u))
            if qwidth <= 0:
                ax.quiver(cx[:-1], cy[:-1], u, v, c, scale_units='xy',
                          angles='xy', scale=1, cmap=qcmap)
            else:
                ax.quiver(cx[:-1], cy[:-1], u, v, c, units='xy',
                          scale_units='xy', angles='xy', scale=1,
                          width=qwidth, cmap=qcmap)
        elif randcolor:
            plt.plot(cx, cy, '.-', color=color, ms=ms, alpha=plot_alpha,
                     label=str(trackid))
        else:
            plt.plot(cx, cy, '.-', ms=ms, alpha=plot_alpha,
                     label=str(trackid))
    fig.tight_layout()
    return fig
def hash(gram, hash_size):
    """Hash a gram into the bounded hash_size space."""
    # NOTE: this name shadows the built-in hash().
    return murmurhash3_32(gram, positive=True) % hash_size
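# A small sketch (not from the original code) of the hashing trick this helper
# supports: arbitrary n-grams are folded into a fixed number of buckets.
# hash_size=2**10 is an arbitrary illustrative choice; distinct grams may
# collide, which the trick accepts by design.
hash_size = 2 ** 10
for gram in ('new york', 'york times', 'new york times'):
    print(gram, '->', hash(gram, hash_size))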
def test_mmhash3_bytes():
    assert murmurhash3_32(b"foo", 0) == -156908512
    assert murmurhash3_32(b"foo", 42) == -1322301282

    assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
    assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014