Python murmurhash3_32の例、sklearn.utils.murmurhash.murmurhash3_32 Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: AlexandreAbraham/scikit-learn

def test_mmhash3_int():
    """Check murmurhash3_32 of the integer key 3 against reference values.

    Each case is a set of keyword arguments together with the hash value
    the call must return. Seeds 0 (also the implicit default) and 42 are
    exercised with the ``positive`` flag omitted, False and True.
    """
    cases = [
        # default signedness
        ({}, 847579505),
        ({'seed': 0}, 847579505),
        ({'seed': 42}, -1823081949),
        # explicitly signed
        ({'positive': False}, 847579505),
        ({'seed': 0, 'positive': False}, 847579505),
        ({'seed': 42, 'positive': False}, -1823081949),
        # unsigned
        ({'positive': True}, 847579505),
        ({'seed': 0, 'positive': True}, 847579505),
        ({'seed': 42, 'positive': True}, 2471885347),
    ]
    for kwargs, expected in cases:
        assert_equal(murmurhash3_32(3, **kwargs), expected)

コード例 #2

0

ファイルを表示

ファイル: data_preprocess.py プロジェクト: ld0214/ctr_prediction

def generate_input_data(source_file):
    """Write a hashed-feature text version of a preprocessed CSV file.

    The output path is ``source_file`` with its last 16 characters
    (the length of ``"preprocessed.csv"``) replaced by ``"hashed.txt"``.
    Every CSV row becomes one output line:

    * for ``"train_data_preprocessed.csv"`` the line starts with the row's
      ``click`` value, which is then excluded from the features;
    * for ``"test_data_preprocessed.csv"`` the line starts with the
      zero-based row index.

    The prefix is followed by `` <hash>:1`` for each remaining column,
    where ``<hash>`` is the unsigned murmurhash3 of the column name
    concatenated with the value, emitted in ascending hash order.

    Args:
        source_file: Path of the CSV file to convert; must be exactly one
            of the two file names listed above.

    Raises:
        AssertionError: If a row is read from a file whose name is neither
            of the two supported names.
    """
    from csv import DictReader

    from sklearn.utils.murmurhash import murmurhash3_32

    new_file = source_file[:-16] + "hashed.txt"
    is_train = source_file == "train_data_preprocessed.csv"

    # Context managers guarantee both handles are flushed and closed even
    # when a row fails half-way; the original leaked both of them.
    with open(source_file, mode='r') as fr, open(new_file, mode='w') as fw:
        for count, row in enumerate(DictReader(fr)):
            if is_train:
                fw.write(str(row["click"]))
                del row["click"]
            else:
                assert source_file == "test_data_preprocessed.csv"
                fw.write(str(count))

            # Hash into a fresh list instead of overwriting the row's
            # values while iterating over it.
            hashed = sorted(
                murmurhash3_32(str(k) + str(value), positive=True)
                for k, value in row.items()
            )
            fw.write("".join(" " + str(h) + ':1' for h in hashed))
            fw.write("\n")

            if count % 1000000 == 0:
                print("{0} row finished".format(count))

コード例 #3

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: AlexandreAbraham/scikit-learn

def test_mmhash3_int_array():
    """Check that hashing an int32 array matches hashing each element.

    A (3, 2, 1) array of random int32 keys is hashed in one call and the
    result is compared with the element-by-element hashes, for seeds 0 and
    42, first with the default signedness and then with ``positive=True``.
    """
    rng = np.random.RandomState(42)
    flat_keys = rng.randint(-5342534, 345345, size=3 * 2 * 1)
    keys = flat_keys.astype(np.int32).reshape((3, 2, 1))
    seeds = [0, 42]

    # Signed hashes; scalars are passed as plain Python ints.
    for seed in seeds:
        per_key = [murmurhash3_32(int(k), seed) for k in keys.flat]
        expected = np.array(per_key).reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    # Unsigned hashes; scalars are passed as the array's own elements.
    for seed in seeds:
        per_key = [murmurhash3_32(k, seed, positive=True) for k in keys.flat]
        expected = np.array(per_key).reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True),
                           expected)

コード例 #4

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: AlexandreAbraham/scikit-learn

def test_uniform_distribution():
    """Check that hashes of 0..99999 spread evenly over 10 buckets.

    Each integer is hashed (unsigned) and counted in the bucket given by
    the hash modulo the bucket count; the observed bucket frequencies must
    match the uniform frequency to 2 decimals in relative terms.
    """
    bucket_count = 10
    sample_count = 100000
    counts = np.zeros(bucket_count, dtype=np.float64)

    for key in range(sample_count):
        slot = murmurhash3_32(key, positive=True) % bucket_count
        counts[slot] += 1

    observed = counts / sample_count
    uniform = np.ones(bucket_count) / bucket_count

    assert_array_almost_equal(observed / uniform, np.ones(bucket_count), 2)

コード例 #5

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: tnunes/scikit-learn

def test_uniform_distribution():
    """Check that hashes of 0..99999 spread evenly over 10 buckets.

    Each integer is hashed with ``positive=True`` and counted in the bucket
    ``hash % n_bins``. The ratio of observed to expected bucket frequency
    must equal 1 to 2 decimal places for every bucket.
    """
    n_bins, n_samples = 10, 100000
    # np.float64 rather than the ``np.float`` alias: the alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    bins = np.zeros(n_bins, dtype=np.float64)

    for i in range(n_samples):
        bins[murmurhash3_32(i, positive=True) % n_bins] += 1

    means = bins / n_samples
    expected = np.ones(n_bins) / n_bins

    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)

コード例 #6

0

ファイルを表示

 def _f(f):
     """Return True when file name ``f`` should be processed.

     ``partition_no``, ``partitions_count``, ``args`` and ``dest_dir`` are
     read from the enclosing scope (not visible here). A file is rejected
     when partitioning is active (``partition_no >= 0``) and its seeded
     hash modulo ``partitions_count`` is a different partition, or when
     ``args.no_skip`` is false and the target
     ``<dest_dir>large/v1_<stem>.jpg`` already exists as a regular file.
     """
     in_partition = (
         partition_no < 0
         or murmurhash3_32(f, args.seed) % partitions_count == partition_no
     )
     if not in_partition:
         return False

     stem = f.rsplit(".", 1)[0]
     target = dest_dir + 'large' + '/v1_' + stem + '.jpg'
     if args.no_skip:
         return True
     already_done = os.path.exists(target) and os.path.isfile(target)
     return not already_done

コード例 #7

0

ファイルを表示

ファイル: utils.py プロジェクト: tesemnikov-av/LightAutoML

def single_text_hash(x: str) -> str:
    """Return a string form of the murmurhash3 (seed 13) of a text.

    Strictly positive hashes are rendered as their decimal digits. Any
    other value is rendered as ``'m'`` followed by the absolute value, so
    a hash of zero yields ``'m0'``.

    Args:
        x: text.

    Returns:
        string text hash.

    """
    hashed = murmurhash3_32(x, seed=13)
    if hashed > 0:
        return str(hashed)
    return 'm' + str(abs(hashed))

コード例 #8

0

ファイルを表示

    def _make_category(df: DataFrame, cols: Sequence[str]) -> np.ndarray:
        """Hash the row-wise combination of several columns.

        For every row, the values of ``cols`` are converted to strings,
        joined with ``"_"`` and hashed with murmurhash3 (seed 42).

        Args:
            df: Input DataFrame
            cols: List of columns

        Returns:
            int32 array holding one hash per row of ``df``.

        """
        n_rows = df.shape[0]
        hashes = np.empty((n_rows, ), dtype=np.int32)
        selected = [df[name] for name in cols]

        for row_idx, values in enumerate(zip(*selected)):
            key = "_".join(str(v) for v in values)
            hashes[row_idx] = murmurhash3_32(key, seed=42)

        return hashes

コード例 #9

0

ファイルを表示

def make_color(num: int) -> Tuple[int]:
    """Derive a colour from a number via its murmur hash.

    The unsigned murmurhash3 of ``num`` is serialised as 8 little-endian
    bytes and the first three bytes (the least significant ones) are used
    as the r, g and b components.

    Parameters
    ----------
    num: int
        number to use as hash key

    Returns
    -------
    bytes[3]
        (r, g, b) values; this is a 3-byte ``bytes`` slice, not a tuple.

    """
    hashed = murmurhash3_32(num, positive=True)
    raw = hashed.to_bytes(8, 'little')
    return raw[:3]

コード例 #10

0

ファイルを表示

ファイル: plot_tracks.py プロジェクト: subhacom/argos

def play_tracks(vidfile, trackfile, lw=2, color='auto',
                fontscale=1, fthickness=1,
                fstart=0, fend=-1, trail=0, trail_sec=False,
                torigfile=None, tmtfile=None,
                vout=None, outfmt='MJPG', fps=None, vwidth=None, vheight=None,
                timestamp=False, dt=True, skipempty=False):
    """
    Play the video from `vidfile` and overlay the bounding boxes and IDs of the
    tracked animals from `trackfile`.

    Frames are shown in an OpenCV window named after the video file.
    Pressing ``q`` or ``Esc`` stops playback. If the video cannot be opened
    a message is printed and the function returns without doing anything.

    Parameters
    ----------
    vidfile: str
        Path of video file.
    trackfile: str
        Path of track data file. Read as CSV when it ends in ``.csv``,
        otherwise as HDF with key ``tracked``.
    lw: int
        Line width for drawing
    color: str
        If ``auto``, derive a color for each animal from the hash of its
        track id. Anything but ``auto`` will use red.
    fontscale: float
        Scaling for text font
    fthickness: int
        Font thickness in OpenCV text display
    fstart: int
        Frame # to start with
    fend: int
        Last frame # to process (-1 means end of video)
    trail: int
        Number of frames or seconds of trail to show before current frame. Thus
        with `trail`=100, the positions in the past 100 frames will be drawn
        like a tail behind each animal.
    trail_sec: bool
        Whether the `trail` is number of frames or number of seconds.
    torigfile: str
        Path to original timestamp file.
    tmtfile: str
        Path to motion-tracked timestamp file. Requires `torigfile`.
    vout: str
        output video file path.
    outfmt: str
        output video format
    fps: float
        fps of output video. Defaults to the fps of the input video.
    vwidth: int
        Output video width.
    vheight: int
        Output video height.
    timestamp: bool
        If `True` then show timestamp.
    dt: bool
        If `True`, and `timestamp` is also `True`, then show time elapsed from
        start instead of timestamp.
    skipempty: bool
        If `True` skip frames without any track.

    Raises
    ------
    ValueError
        If `tmtfile` is given without `torigfile`.
    """
    # The motion-tracked timestamps are merged onto the original ones, so
    # they cannot be used alone (this used to fail with a NameError).
    if tmtfile is not None and torigfile is None:
        raise ValueError('`tmtfile` requires `torigfile` to be specified')
    cap = cv2.VideoCapture(vidfile)
    if not cap.isOpened():
        print('Could not open file', vidfile)
        cap.release()
        return
    out = None
    try:
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        # Needed both for synthetic timestamps and as the default output
        # fps, so read it unconditionally.
        infps = cap.get(cv2.CAP_PROP_FPS)
        if trackfile.endswith('.csv'):
            tracks = pd.read_csv(trackfile)
        else:
            tracks = pd.read_hdf(trackfile, 'tracked')
        timestamps = None
        if torigfile is not None:
            torig = pd.read_csv(torigfile)
            timestamps = torig
        if tmtfile is not None:
            tmt = pd.read_csv(tmtfile)
            timestamps = pd.merge(torig, tmt, left_on='outframe',
                                  right_on='inframe')
            timestamps.drop(['outframe_x', 'timestamp_y', 'inframe_x',
                             'inframe_y'],
                            axis=1, inplace=True)
            timestamps.rename({'inframe_x': 'origframe', 'inframe_y': 'inframe',
                               'timestamp_x': 'timestamp', 'outframe_y': 'frame'},
                              axis=1,
                              inplace=True)

        if timestamps is None:
            # No timestamp files: synthesize timestamps from the video
            # file's modification time and the input frame rate.
            tstart = datetime.fromtimestamp(time.mktime(time.localtime(
                os.path.getmtime(vidfile))))
            # Named `elapsed`, not `dt`, so the boolean `dt` parameter is
            # not overwritten by an array.
            elapsed = np.arange(frame_count) / infps
            stamps = tstart + pd.to_timedelta(elapsed, unit='s')
            timestamps = pd.DataFrame({'frame': np.arange(frame_count),
                                       'timestamp': stamps})
        else:
            timestamps['timestamp'] = pd.to_datetime(timestamps['timestamp'])
        # Reference time for the elapsed-time display; loop invariant.
        tstart = timestamps['timestamp'].min()
        win = os.path.basename(vidfile)
        cv2.namedWindow(win, cv2.WINDOW_NORMAL)
        colors = {}
        for ii in set(tracks.trackid.values):
            if color == 'auto':
                val = murmurhash3_32(int(ii), positive=True).to_bytes(8, 'little')
                colors[ii] = (val[0], val[1], val[2])
            else:
                colors[ii] = (0, 0, 255)
        width = None
        height = None
        scale_x = 1
        scale_y = 1
        # Frames are resized only when writing output AND a target size was
        # requested; without the parentheses a lone `vheight` triggered a
        # resize to (None, None).
        resize = vout is not None and (vwidth is not None or
                                       vheight is not None)
        if vout is not None:
            fourcc = cv2.VideoWriter_fourcc(*outfmt)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            if resize:
                w, h = resize_dim(width, height, vwidth, vheight)
                scale_x = w / float(width)
                scale_y = h / float(height)
                width, height = (w, h)
            if fps is None:
                fps = infps
            out = cv2.VideoWriter(vout, fourcc, fps,
                                  (width, height))
            print(f'Saving video with tracks in {vout}. Video format {outfmt}')
        frame_no = -1
        if fstart > 0:
            frame_no = fstart - 1
            cap.set(cv2.CAP_PROP_POS_FRAMES, fstart)
        if fend < 0:
            fend = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        while frame_no < fend:
            _, frame = cap.read()
            if frame is None:
                print('End at frame', frame_no)
                break
            frame_no += 1
            trackdata = tracks[tracks.frame == frame_no]
            if (len(trackdata) == 0) and skipempty:
                continue
            if resize:
                # The third positional parameter of cv2.resize is `dst`,
                # so the interpolation flag must be passed by keyword.
                frame = cv2.resize(frame, (width, height),
                                   interpolation=cv2.INTER_AREA)
            if timestamp:
                cv2.putText(frame, str(int(frame_no)), (100, 100),
                            cv2.FONT_HERSHEY_COMPLEX, fontscale, (255, 255, 0),
                            fthickness, cv2.LINE_AA)
                ts = timestamps[timestamps['frame'] == frame_no][
                    'timestamp'].iloc[0]
                if dt:
                    ts = ts - tstart
                cv2.putText(frame, str(ts), (frame.shape[1] - 200, 100),
                            cv2.FONT_HERSHEY_COMPLEX, fontscale, (255, 255, 0),
                            fthickness, cv2.LINE_AA)
            # Get the trail of the track (history)
            hist = None
            if trail > 0:
                if trail_sec:
                    ts = timestamps[timestamps['frame'] == frame_no][
                        'timestamp'].iloc[0]
                    tdelta = ts - timestamps['timestamp']
                    ds = tdelta.dt.total_seconds()
                    tgood = timestamps[(0 < ds) & (ds < trail)]
                    hist = pd.merge(tracks, tgood, how='inner',
                                    on='frame')
                else:
                    hist = tracks[(tracks.frame < frame_no) &
                                  (tracks.frame >= frame_no - trail)]
            for row in trackdata.itertuples():
                id_ = int(row.trackid)
                if hist is not None:
                    cur_hist = hist[hist.trackid == id_]
                    # Scale the trail like the boxes below so both line up
                    # on a resized frame (scale is 1 otherwise).
                    hx = (cur_hist.x.values + cur_hist.w.values / 2.0) * scale_x
                    hy = (cur_hist.y.values + cur_hist.h.values / 2.0) * scale_y
                    for _hx, _hy in zip(hx, hy):
                        cv2.circle(frame, (int(_hx), int(_hy)), 1,
                                   colors[id_], -1)
                cv2.rectangle(frame,
                              (int(row.x * scale_x), int(row.y * scale_y)),
                              (int((row.x + row.w) * scale_x),
                               int((row.y + row.h) * scale_y)),
                              colors[id_], lw)
                cv2.putText(frame, str(id_),
                            (int(row.x * scale_x), int(row.y * scale_y)),
                            cv2.FONT_HERSHEY_COMPLEX, fontscale, colors[id_],
                            fthickness, cv2.LINE_AA)
            cv2.imshow(win, frame)
            if out is not None:
                out.write(frame)
            key = cv2.waitKey(100)
            if key == ord('q') or key == 27:
                break
    finally:
        # Release the writer and the capture even if reading the track or
        # timestamp data, or drawing a frame, raised.
        if out is not None:
            out.release()
        cap.release()

コード例 #11

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: AlexandreAbraham/scikit-learn

def test_no_collision_on_byte_range():
    """Check that strings of 0 to 99 spaces all hash to distinct values.

    All strings are hashed with seed 0. Every hash is recorded after it is
    checked; without that the set stays empty and the assertion can never
    fail.
    """
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert_true(h not in previous_hashes,
                    "Found collision on growing empty string")
        previous_hashes.add(h)

コード例 #12

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: AlexandreAbraham/scikit-learn

def test_mmhash3_unicode():
    """Check murmurhash3_32 of the unicode string 'foo' for seeds 0 and 42.

    Signed results are checked first, then the unsigned results obtained
    with ``positive=True``.
    """
    signed = [(0, -156908512), (42, -1322301282)]
    unsigned = [(0, 4138058784), (42, 2972666014)]

    for seed, expected in signed:
        assert_equal(murmurhash3_32(u('foo'), seed), expected)

    for seed, expected in unsigned:
        assert_equal(murmurhash3_32(u('foo'), seed, positive=True), expected)

コード例 #13

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: AlexLerman/scikit-learn

def test_mmhash3_bytes():
    """Check murmurhash3_32 of the bytes key b'foo' for seeds 0 and 42.

    The key is an explicit bytes literal so the test hashes bytes on
    Python 3 as well, and the expected unsigned values are plain integer
    literals: the Python 2 ``L`` suffix is a syntax error on Python 3 and
    unnecessary on Python 2.
    """
    assert_equal(murmurhash3_32(b'foo', 0), -156908512)
    assert_equal(murmurhash3_32(b'foo', 42), -1322301282)

    assert_equal(murmurhash3_32(b'foo', 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(b'foo', 42, positive=True), 2972666014)

コード例 #14

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: danielmoreira12/BAProject

def test_no_collision_on_byte_range():
    """Check that strings of 0 to 99 spaces all hash to distinct values.

    All strings are hashed with seed 0. Every hash is recorded after it is
    checked; without that the set stays empty and the assertion can never
    fail.
    """
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(' ' * i, 0)
        assert h not in previous_hashes, \
            "Found collision on growing empty string"
        previous_hashes.add(h)

コード例 #15

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: danielmoreira12/BAProject

def test_mmhash3_unicode():
    """Check murmurhash3_32 of the string 'foo' for seeds 0 and 42.

    Each case lists the seed, the signed hash and the unsigned hash
    (``positive=True``). All signed values are checked before the
    unsigned ones.
    """
    cases = [
        (0, -156908512, 4138058784),
        (42, -1322301282, 2972666014),
    ]

    for seed, signed, _ in cases:
        assert murmurhash3_32('foo', seed) == signed

    for seed, _, unsigned in cases:
        assert murmurhash3_32('foo', seed, positive=True) == unsigned

コード例 #16

0

ファイルを表示

ファイル: plot_tracks.py プロジェクト: subhacom/argos

def plot_tracks(trackfile, ms=5, lw=5, show_bbox=True,
                bbox_alpha=(0.0, 1.0), plot_alpha=1.0, quiver=True,
                qcmap='hot', qwidth=-1, vidfile=None,
                frame=-1, fstart=0, fend=-1,
                gray=False,
                randcolor=True,
                axes=False):
    """Plot the tracks in `trackfile`, optionally over a video frame.

    Parameters
    ----------
    trackfile: str
        Path of track data file. Read as CSV when it ends in ``.csv``,
        otherwise as HDF with key ``tracked``.
    ms: float
        Marker size for the line plot of track centres.
    lw: float
        Line width of the bounding boxes.
    show_bbox: bool
        If `True`, draw the bounding box of every track position.
    bbox_alpha: tuple of float
        Alpha of the first and the last bounding box of a track; boxes in
        between are interpolated linearly.
    plot_alpha: float
        Alpha of the line plot of track centres.
    quiver: bool
        If `True`, draw each track as arrows between successive centres,
        colored by position along the track. Otherwise draw a line plot.
    qcmap: str
        Colormap for the quiver arrows.
    qwidth: float
        Arrow shaft width; values <= 0 use matplotlib's default.
    vidfile: str
        Path of a video file to take the background image from.
    frame: int
        Frame # to use as background (negative means last frame).
    fstart: int
        First frame # whose track data is plotted.
    fend: int
        Last frame # whose track data is plotted (negative means the last
        frame present in the track data).
    gray: bool
        If `True`, show the background image in grayscale.
    randcolor: bool
        Without quiver, whether to color each line by the hash of its track
        id instead of using matplotlib's default color cycle.
    axes: bool
        If `False` and a background image is shown, hide axes and spines.

    Returns
    -------
    matplotlib figure
        The figure containing the plot.
    """
    if trackfile.endswith('.csv'):
        tracks = pd.read_csv(trackfile)
    else:
        tracks = pd.read_hdf(trackfile, 'tracked')
    img = None
    fig, ax = plt.subplots()
    if vidfile is not None:
        cap = cv2.VideoCapture(vidfile)
        try:
            if frame < 0:
                frame = cap.get(cv2.CAP_PROP_FRAME_COUNT) - 1
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame))
            _, img = cap.read()
        finally:
            # Only one frame is needed; do not keep the video file open.
            cap.release()
        if img is None:
            print('Could not read image')
        elif img.shape[-1] == 3:  # BGR
            if gray:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        elif len(img.shape) == 2:
            gray = True
    if img is not None:
        if gray:
            ax.imshow(img, origin='upper', cmap='gray')
        else:
            ax.imshow(img, origin='upper')
        if not axes:
            ax.xaxis.set_visible(False)
            ax.yaxis.set_visible(False)
            for side in ('left', 'bottom', 'top', 'right'):
                ax.spines[side].set_visible(False)

    print('Unique tracks:', len(tracks.trackid.unique()))
    if fend < 0:
        fend = tracks.frame.max()
    tracks = tracks[(tracks.frame >= fstart) & (tracks.frame <= fend)].copy()
    for trackid, trackgrp in tracks.groupby('trackid'):
        pos = trackgrp.sort_values(by='frame')
        cx = pos.x + pos.w / 2.0
        # The Y axis is inverted when using image.
        # Keep it consistent when no image is used.
        if img is None:
            cy = - (pos.y + pos.h / 2.0)
        else:
            cy = pos.y + pos.h / 2.0

        val = murmurhash3_32(int(trackid), positive=True).to_bytes(8, 'little')
        color = (val[0] / 255.0, val[1] / 255.0, val[2] / 255.0)
        if show_bbox:
            alphas = np.linspace(bbox_alpha[0], bbox_alpha[1], len(pos))
            for p, alpha in zip(pos.itertuples(), alphas):
                # Without an image the centres are plotted at negated y, so
                # the box must be mirrored too: its lower edge in the
                # flipped coordinates is -(y + h).
                bottom = p.y if img is not None else -(p.y + p.h)
                bbox = plt.Rectangle((p.x, bottom),
                                     p.w, p.h,
                                     linewidth=lw,
                                     edgecolor=color,
                                     facecolor='none',
                                     alpha=alpha)
                ax.add_patch(bbox)

        if quiver:
            u = np.diff(cx)
            v = np.diff(cy)
            c = np.linspace(0, 1, len(u))
            if qwidth <= 0:
                ax.quiver(cx[:-1], cy[:-1], u, v, c,
                          scale_units='xy', angles='xy',
                          scale=1, cmap=qcmap)
            else:
                ax.quiver(cx[:-1], cy[:-1], u, v, c,
                          units='xy',
                          scale_units='xy', angles='xy',
                          scale=1, width=qwidth, cmap=qcmap)
        elif randcolor:
            ax.plot(cx, cy, '.-', color=color, ms=ms, alpha=plot_alpha,
                    label=str(trackid))
        else:
            ax.plot(cx, cy, '.-', ms=ms, alpha=plot_alpha,
                    label=str(trackid))
    fig.tight_layout()
    return fig

コード例 #17

0

ファイルを表示

def hash(gram, hash_size):
    """Hash ``gram`` into a bounded space of ``hash_size`` buckets."""
    unsigned_hash = murmurhash3_32(gram, positive=True)
    return unsigned_hash % hash_size

コード例 #18

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: Aathi410/Pro123

def test_mmhash3_bytes():
    """Check murmurhash3_32 of the bytes key b"foo" for seeds 0 and 42.

    Each case lists the seed, the signed hash and the unsigned hash
    (``positive=True``). All signed values are checked before the
    unsigned ones.
    """
    key = b"foo"
    cases = [
        (0, -156908512, 4138058784),
        (42, -1322301282, 2972666014),
    ]

    for seed, signed, _ in cases:
        assert murmurhash3_32(key, seed) == signed

    for seed, _, unsigned in cases:
        assert murmurhash3_32(key, seed, positive=True) == unsigned

コード例 #19

0

ファイルを表示

def test_mmhash3_unicode():
    """Check murmurhash3_32 of the unicode string 'foo' for seeds 0 and 42.

    Each case is the ``positive`` flag (or None to omit it), the seed and
    the hash value the call must return.
    """
    cases = [
        (None, 0, -156908512),
        (None, 42, -1322301282),
        (True, 0, 4138058784),
        (True, 42, 2972666014),
    ]
    for positive, seed, expected in cases:
        if positive is None:
            actual = murmurhash3_32(u('foo'), seed)
        else:
            actual = murmurhash3_32(u('foo'), seed, positive=positive)
        assert_equal(actual, expected)

コード例 #20

0

ファイルを表示

ファイル: test_murmurhash.py プロジェクト: devs1991/test_edx_docmode

def test_mmhash3_bytes():
    """Check murmurhash3_32 of the bytes key b'foo' for seeds 0 and 42.

    The key is an explicit bytes literal so the test hashes bytes on
    Python 3 as well, and the expected unsigned values are plain integer
    literals: the Python 2 ``L`` suffix is a syntax error on Python 3 and
    unnecessary on Python 2.
    """
    assert_equal(murmurhash3_32(b'foo', 0), -156908512)
    assert_equal(murmurhash3_32(b'foo', 42), -1322301282)

    assert_equal(murmurhash3_32(b'foo', 0, positive=True), 4138058784)
    assert_equal(murmurhash3_32(b'foo', 42, positive=True), 2972666014)