# Imports assumed by the test snippets in this listing (assert_true came
# from the nose-era sklearn.utils.testing):
import numpy as np
from numpy.testing import (assert_array_almost_equal, assert_array_equal,
                           assert_equal)
from sklearn.utils.murmurhash import murmurhash3_32


def test_mmhash3_int():
    assert_equal(murmurhash3_32(3), 847579505)
    assert_equal(murmurhash3_32(3, seed=0), 847579505)
    assert_equal(murmurhash3_32(3, seed=42), -1823081949)

    assert_equal(murmurhash3_32(3, positive=False), 847579505)
    assert_equal(murmurhash3_32(3, seed=0, positive=False), 847579505)
    assert_equal(murmurhash3_32(3, seed=42, positive=False), -1823081949)

    assert_equal(murmurhash3_32(3, positive=True), 847579505)
    assert_equal(murmurhash3_32(3, seed=0, positive=True), 847579505)
    assert_equal(murmurhash3_32(3, seed=42, positive=True), 2471885347)
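# A quick companion check (not part of the original test): positive=True is
# the same 32-bit digest reinterpreted as unsigned, so the signed and
# positive results above are congruent modulo 2**32.
assert murmurhash3_32(3, seed=42, positive=True) == \
    murmurhash3_32(3, seed=42) % 2 ** 32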
Example #2
def generate_input_data(source_file):
    """Hash every "column=value" pair of a preprocessed CSV with a positive
    murmur hash and write the result as libsvm-style lines."""
    from sklearn.utils.murmurhash import murmurhash3_32
    from csv import DictReader

    # "..._preprocessed.csv" (16 chars) -> "..._hashed.txt"
    new_file = source_file[:-16] + "hashed.txt"
    with open(new_file, mode='w') as fw, open(source_file, mode='r') as fr:
        for count, row in enumerate(DictReader(fr)):
            if source_file == "train_data_preprocessed.csv":
                # training rows start with the click label
                fw.write(str(row["click"]))
                del row["click"]
            else:
                assert source_file == "test_data_preprocessed.csv"
                # test rows start with the row index
                fw.write(str(count))

            for k, value in row.items():
                row[k] = murmurhash3_32(str(k) + str(value), positive=True)

            row_sorted = sorted(row.items(), key=lambda x: x[1])
            for k, value in row_sorted:
                fw.write(" " + str(value) + ':1')
            fw.write("\n")

            if count % 1000000 == 0:
                print("{0} rows finished".format(count))
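# Hedged usage sketch for the helper above; the file names match the strings
# the function itself checks for and are assumptions about the caller's data.
# generate_input_data("train_data_preprocessed.csv")
# generate_input_data("test_data_preprocessed.csv")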
def test_mmhash3_int_array():
    rng = np.random.RandomState(42)
    keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
    keys = keys.reshape((3, 2, 1))

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(int(k), seed)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(k, seed, positive=True)
                             for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True),
                           expected)
def test_uniform_distribution():
    n_bins, n_samples = 10, 100000
    bins = np.zeros(n_bins, dtype=np.float64)

    for i in range(n_samples):
        bins[murmurhash3_32(i, positive=True) % n_bins] += 1

    means = bins / n_samples
    expected = np.ones(n_bins) / n_bins

    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
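# The modulo binning above is the same trick the hashing trick uses to map a
# hashed feature to a column index; a minimal sketch (n_features is an
# illustrative choice, not a value from the test):
n_features = 2 ** 20
col = murmurhash3_32(b"feature=value", positive=True) % n_features
assert 0 <= col < n_features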
Example #6
def _f(f):
    # Predicate over image file names; partition_no, partitions_count, args
    # and dest_dir come from the enclosing scope.
    if partition_no >= 0:
        # stable hash-based sharding: keep only this worker's partition
        file_partition = murmurhash3_32(f, args.seed) % partitions_count
        if file_partition != partition_no:
            return False
    file_name = f.rsplit(".", 1)[0]
    file_path = dest_dir + 'large' + '/v1_' + file_name + '.jpg'
    if args.no_skip:
        return True
    # skip files whose output already exists
    if os.path.exists(file_path) and os.path.isfile(file_path):
        return False
    return True
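# Self-contained sketch of the sharding idea in _f above; seed, partition_no
# and partitions_count are illustrative stand-ins for the enclosing scope's
# variables:
def in_partition(name, partition_no=0, partitions_count=4, seed=0):
    # each name is stably assigned to exactly one of partitions_count workers
    return murmurhash3_32(name, seed) % partitions_count == partition_no

mine = [f for f in ["a.png", "b.png", "c.png"] if in_partition(f)]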
Example #7
def single_text_hash(x: str) -> str:
    """Get text hash.

    Args:
        x: text.

    Returns:
        string text hash.

    """
    numhash = murmurhash3_32(x, seed=13)
    texthash = str(numhash) if numhash > 0 else 'm' + str(abs(numhash))
    return texthash
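# Usage sketch: a negative digest such as -42 is rendered as 'm42', a
# positive one as its plain decimal string, keeping the result alphanumeric.
print(single_text_hash("some text"))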
Example #8
    def _make_category(df: DataFrame, cols: Sequence[str]) -> np.ndarray:
        """Make hash for category interactions.

        Args:
            df: Input DataFrame
            cols: List of columns

        Returns:
            Hash np.ndarray.

        """
        res = np.empty((df.shape[0], ), dtype=np.int32)

        for n, inter in enumerate(zip(*(df[x] for x in cols))):
            h = murmurhash3_32("_".join(map(str, inter)), seed=42)
            res[n] = h

        return res
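# The same interaction hash written inline for a toy frame; pandas and the
# column names here are illustrative assumptions of this sketch:
import pandas as pd
toy = pd.DataFrame({"a": ["x", "y"], "b": ["u", "v"]})
print([murmurhash3_32("_".join(map(str, t)), seed=42)
       for t in zip(toy["a"], toy["b"])])  # e.g. the hash of "x_u" per row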
Example #9
def make_color(num: int) -> Tuple[int, int, int]:
    """Create a deterministic pseudo-random color based on a number.

    The provided number is passed through the murmur hash function in order
    to generate bytes which are somewhat apart from each other. The three
    least significant byte values are taken as r, g, and b.

    Parameters
    ----------
    num: int
        number to use as hash key

    Returns
    -------
    tuple of int
        (r, g, b) values

    """
    val = murmurhash3_32(num, positive=True).to_bytes(8, 'little')
    # color = qg.QColor(val[0], val[1], val[2])
    return tuple(val[:3])
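# Usage sketch: hashing makes the color deterministic, so the same id always
# gets the same (r, g, b) triple.
r, g, b = make_color(7)
assert make_color(7) == (r, g, b)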
Example #10
def play_tracks(vidfile, trackfile, lw=2, color='auto',
                fontscale=1, fthickness=1,
                fstart=0, fend=-1, trail=0, trail_sec=False,
                torigfile=None, tmtfile=None,
                vout=None, outfmt='MJPG', fps=None, vwidth=None, vheight=None,
                timestamp=False, dt=True, skipempty=False):
    """
    Play the video from `vidfile` and overlay the bounding boxes and IDs of the
    tracked animals from `trackfile`.

    Parameters
    ----------
    vidfile: str
        Path of video file.
    trackfile: str
        Path of track data file.
    lw: int
        Line width for drawing.
    color: str
        If ``auto``, use a random color for each animal.
        Anything but ``auto`` will use red.
    fontscale: float
        Scaling for the text font.
    fthickness: int
        Font thickness in OpenCV text display.
    fstart: int
        Frame # to start with.
    fend: int
        Last frame # to process (-1 means end of video).
    trail: int
        Number of frames or seconds of trail to show before the current
        frame. Thus with `trail`=100, the positions in the past 100 frames
        will be drawn like a tail behind each animal.
    trail_sec: bool
        Whether `trail` is a number of frames or a number of seconds.
    torigfile: str
        Path to original timestamp file.
    tmtfile: str
        Path to motion-tracked timestamp file.
    vout: str
        Output video file path.
    outfmt: str
        Output video format.
    fps: float
        FPS of the output video.
    vwidth: int
        Output video width.
    vheight: int
        Output video height.
    timestamp: bool
        If `True`, show the timestamp.
    dt: bool
        If `True`, and `timestamp` is also `True`, show the time elapsed
        from the start instead of the timestamp.
    skipempty: bool
        If `True`, skip frames without any track.
    """
    cap = cv2.VideoCapture(vidfile)
    if not cap.isOpened():
        print('Could not open file', vidfile)
        return
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    if trackfile.endswith('.csv'):
        tracks = pd.read_csv(trackfile)
    else:
        tracks = pd.read_hdf(trackfile, 'tracked')
    timestamps = None
    if torigfile is not None:
        torig = pd.read_csv(torigfile)
        timestamps = torig
    if tmtfile is not None:
        tmt = pd.read_csv(tmtfile)
        timestamps = pd.merge(torig, tmt, left_on='outframe',
                              right_on='inframe')
        timestamps.drop(['outframe_x', 'timestamp_y', 'inframe_x',
                         'inframe_y'],
                        axis=1, inplace=True)
        timestamps.rename({'inframe_x': 'origframe', 'inframe_y': 'inframe',
                           'timestamp_x': 'timestamp', 'outframe_y': 'frame'},
                          axis=1,
                          inplace=True)

    if timestamps is None:
        tstart = datetime.fromtimestamp(time.mktime(time.localtime(
            os.path.getmtime(vidfile))))
        infps = cap.get(cv2.CAP_PROP_FPS)
        # separate name so we do not shadow the boolean `dt` parameter
        tsec = np.arange(frame_count) / infps
        ts = tstart + pd.to_timedelta(tsec, unit='s')
        timestamps = pd.DataFrame({'frame': np.arange(frame_count),
                                   'timestamp': ts})
    else:
        timestamps['timestamp'] = pd.to_datetime(timestamps['timestamp'])
        tstart = timestamps['timestamp'].min()
    win = os.path.basename(vidfile)
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
    colors = {}
    for ii in set(tracks.trackid.values):
        if color == 'auto':
            val = murmurhash3_32(int(ii), positive=True).to_bytes(8, 'little')
            colors[ii] = (val[0], val[1], val[2])

        else:
            colors[ii] = (0, 0, 255)
    out = None
    width = None
    height = None
    scale_x = 1
    scale_y = 1
    if vout is not None:
        fourcc = cv2.VideoWriter_fourcc(*outfmt)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if vwidth is not None or vheight is not None:
            w, h = resize_dim(width, height, vwidth, vheight)
            scale_x = w / float(width)
            scale_y = h / float(height)
            width, height = (w, h)
        if fps is None:
            # read from the capture here: infps is only set in the
            # no-timestamp branch above
            fps = cap.get(cv2.CAP_PROP_FPS)
        out = cv2.VideoWriter(vout, fourcc, fps,
                              (width, height))
        print(f'Saving video with tracks in {vout}. Video format {outfmt}')
    frame_no = -1
    if fstart > 0:
        frame_no = fstart - 1
        cap.set(cv2.CAP_PROP_POS_FRAMES, fstart)
    if fend < 0:
        fend = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    while frame_no < fend:
        ret, frame = cap.read()
        if frame is None:
            print('End at frame', frame_no)
            break
        frame_no += 1
        trackdata = tracks[tracks.frame == frame_no]
        if (len(trackdata) == 0) and skipempty:
            continue
        if vout is not None and (vwidth is not None or vheight is not None):
            frame = cv2.resize(frame, (width, height),
                               interpolation=cv2.INTER_AREA)
        if timestamp:
            cv2.putText(frame, str(int(frame_no)), (100, 100),
                        cv2.FONT_HERSHEY_COMPLEX, fontscale, (255, 255, 0),
                        fthickness, cv2.LINE_AA)
            ts = timestamps.loc[timestamps['frame'] == frame_no,
                                'timestamp'].iloc[0]
            if dt:
                ts = ts - timestamps['timestamp'].min()
            cv2.putText(frame, str(ts), (frame.shape[1] - 200, 100),
                        cv2.FONT_HERSHEY_COMPLEX, fontscale, (255, 255, 0),
                        fthickness, cv2.LINE_AA)
        # Get the trail of the track (history)
        hist = None
        if trail > 0:
            if trail_sec:
                ts = timestamps.loc[timestamps['frame'] == frame_no,
                                    'timestamp'].iloc[0]
                tdelta = ts - timestamps['timestamp']
                ds = tdelta.dt.total_seconds()
                tgood = timestamps[(0 < ds) & (ds < trail)]
                hist = pd.merge(tracks, tgood, how='inner',
                                on='frame')
            else:
                hist = tracks[(tracks.frame < frame_no) &
                              (tracks.frame >= frame_no - trail)]
        for row in trackdata.itertuples():
            # print(f'{row.x}\n{row.y}\n{row.w}\n=====')
            id_ = int(row.trackid)
            if hist is not None:
                cur_hist = hist[hist.trackid == id_]
                hx = cur_hist.x.values + cur_hist.w.values / 2.0
                hy = cur_hist.y.values + cur_hist.h.values / 2.0
                for _hx, _hy in zip(hx, hy):
                    cv2.circle(frame, (int(_hx), int(_hy)), 1,
                               colors[id_], -1)
            # print(id_, colors[id_])
            cv2.rectangle(frame, (int(row.x * scale_x), int(row.y * scale_y)),
                          (int((row.x + row.w) * scale_x), int((row.y + row.h) * scale_y)),
                          colors[id_], lw)
            cv2.putText(frame, str(id_), (int(row.x * scale_x), int(row.y * scale_y)),
                        cv2.FONT_HERSHEY_COMPLEX, fontscale, colors[id_],
                        fthickness, cv2.LINE_AA)
        cv2.imshow(win, frame)
        if out is not None:
            out.write(frame)
        key = cv2.waitKey(100)
        if key == ord('q') or key == 27:
            break
    if out is not None:
        out.release()
    cap.release()
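# Hedged usage sketch; the file names below are placeholders for your own
# video and track data:
# play_tracks("session.avi", "session_tracks.csv", color="auto", trail=100,
#             timestamp=True)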
def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert_true(h not in previous_hashes,
                    "Found collision on growing empty string")
        previous_hashes.add(h)
def test_mmhash3_unicode():
    assert_equal(murmurhash3_32('foo', 0), -156908512)
    assert_equal(murmurhash3_32('foo', 42), -1322301282)

    assert_equal(murmurhash3_32('foo', 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32('foo', 42, positive=True), 2972666014)
def test_mmhash3_bytes():
    assert_equal(murmurhash3_32(b'foo', 0), -156908512)
    assert_equal(murmurhash3_32(b'foo', 42), -1322301282)

    assert_equal(murmurhash3_32(b'foo', 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(b'foo', 42, positive=True), 2972666014)
def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert h not in previous_hashes, \
            "Found collision on growing empty string"
        previous_hashes.add(h)
def test_mmhash3_unicode():
    assert murmurhash3_32('foo', 0) == -156908512
    assert murmurhash3_32('foo', 42) == -1322301282

    assert murmurhash3_32('foo', 0, positive=True) == 4138058784
    assert murmurhash3_32('foo', 42, positive=True) == 2972666014
Example #16
def plot_tracks(trackfile, ms=5, lw=5, show_bbox=True,
                bbox_alpha=(0.0, 1.0), plot_alpha=1.0, quiver=True,
                qcmap='hot', qwidth=-1, vidfile=None,
                frame=-1, fstart=0, fend=-1,
                gray=False,
                randcolor=True,
                axes=False):
    """Plot the trajectories from `trackfile`, optionally over a frame from
    `vidfile`, using one hash-derived color per track."""
    if trackfile.endswith('.csv'):
        tracks = pd.read_csv(trackfile)
    else:
        tracks = pd.read_hdf(trackfile, 'tracked')
    tracks.describe()
    # print('%%%%', bbox_alpha)
    img = None
    fig, ax = plt.subplots()
    if vidfile is not None:
        cap = cv2.VideoCapture(vidfile)
        if frame < 0:
            frame = cap.get(cv2.CAP_PROP_FRAME_COUNT) - 1
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame))
        ret, img = cap.read()
        if img is None:
            print('Could not read image')
        elif img.shape[-1] == 3:  # BGR
            if gray:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        elif len(img.shape) == 2:
            gray = True
    if img is not None:
        if gray:
            ax.imshow(img, origin='upper', cmap='gray')
        else:
            ax.imshow(img, origin='upper')
        if not axes:
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)
            for s in ['left', 'bottom', 'top', 'right']:
                ax.spines[s].set_visible(False)

    print('Unique tracks:', len(tracks.trackid.unique()))
    if fend < 0:
        fend = tracks.frame.max()
    tracks = tracks[(tracks.frame >= fstart) & (tracks.frame <= fend)].copy()
    for trackid, trackgrp in tracks.groupby('trackid'):
        pos = trackgrp.sort_values(by='frame')
        cx = pos.x + pos.w / 2.0
        # The Y axis is inverted when using image.
        # Keep it consistent when no image is used.
        if img is None:
            cy = - (pos.y + pos.h / 2.0)
        else:
            cy = pos.y + pos.h / 2.0

        val = murmurhash3_32(int(trackid), positive=True).to_bytes(8, 'little')
        color = (val[0] / 255.0, val[1] / 255.0, val[2] / 255.0)
        if show_bbox:
            alpha = np.linspace(bbox_alpha[0], bbox_alpha[1], len(pos))
            ii = 0
            for p in pos.itertuples():
                bbox = plt.Rectangle((p.x, p.y),
                                     p.w, p.h,
                                     linewidth=lw,
                                     edgecolor=color,
                                     facecolor='none',
                                     alpha=alpha[ii])
                ii += 1
                ax.add_patch(bbox)

        if quiver:
            u = np.diff(cx)
            v = np.diff(cy)
            c = np.linspace(0, 1, len(u))
            if qwidth <= 0:
                ax.quiver(cx[:-1], cy[:-1], u, v, c,
                          scale_units='xy', angles='xy',
                          scale=1, cmap=qcmap)
            else:
                ax.quiver(cx[:-1], cy[:-1], u, v, c,
                          units='xy',
                          scale_units='xy', angles='xy',
                          scale=1, width=qwidth, cmap=qcmap)
        elif randcolor:
            plt.plot(cx, cy, '.-', color=color, ms=ms, alpha=plot_alpha,
                     label=str(trackid))
        else:
            plt.plot(cx, cy, '.-', ms=ms, alpha=plot_alpha,
                     label=str(trackid))
    fig.tight_layout()
    return fig
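# Usage sketch (the track file name is a placeholder):
# fig = plot_tracks("session_tracks.csv", quiver=False, randcolor=True)
# plt.show()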
Example #17
def hash(gram, hash_size):
    """Hash a gram into the bounded hash_size space."""
    return murmurhash3_32(gram, positive=True) % hash_size
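# Usage sketch: fold an n-gram into a fixed table of 2**20 buckets (the table
# size is an illustrative assumption):
idx = hash("new york", 2 ** 20)
assert 0 <= idx < 2 ** 20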
Example #18
def test_mmhash3_bytes():
    assert murmurhash3_32(b"foo", 0) == -156908512
    assert murmurhash3_32(b"foo", 42) == -1322301282

    assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
    assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014