def one_hot(loader, output, cutoff=0, encoder=None, all_pairs=None):
    """One-hot encode the raw feature groups together with pair features.

    Pair features are produced by ``pair_features`` from
    ``loader['sfeatures']``.  The columns are stacked in the order
    (sfeatures, bfeatures, ffeatures, ifeatures, pairs), encoded, and
    written out through ``save_encoded_features``.

    Args:
        loader: object indexable by 'sfeatures', 'ffeatures', 'ifeatures'
            and 'bfeatures'.
        output: destination handed to ``save_encoded_features``.
        cutoff: second argument to ``encoder.fit``; only used when a new
            encoder is created here.
        encoder: encoder to reuse without refitting; when ``None`` a new
            ``OneHotEncoder`` is created and fitted on this data.
        all_pairs: dict passed to ``pair_features``; a fresh dict is used
            when ``None``.

    Returns:
        The tuple ``(encoder, all_pairs)``.
    """
    logging.info("Loading raw data")
    pair_source = loader['sfeatures']

    logging.info("Generating pair features")
    if all_pairs is None:
        all_pairs = {}
    pair_block = pair_features(pair_source, all_pairs)

    # Remaining groups are read only after the pair features exist.
    f_block, i_block, b_block = (
        loader[key] for key in ('ffeatures', 'ifeatures', 'bfeatures'))

    logging.info("Concatenating all features")
    stacked = np.hstack((pair_source, b_block, f_block, i_block, pair_block))
    # Drop the separate pieces so only the stacked copy stays referenced.
    del pair_source, b_block, f_block, i_block, pair_block

    if encoder is None:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(stacked, cutoff)

    logging.info("One-hot encoding data")
    encoded = encoder.transform(stacked)
    del stacked

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, encoded)
    return encoder, all_pairs
def one_hot_hash(loader, output, d):
    """Hash every (column index, value) cell into ``d`` buckets and save it.

    The four feature groups are stacked in the order (bfeatures, ffeatures,
    ifeatures, sfeatures).  Each cell is hashed with ``murmurhash3_32``
    (seed 42) on the string ``'<column>_<value>'`` and reduced modulo ``d``
    to pick a column of the sparse output, which holds a 1.0 per cell.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
        d: number of hash buckets, i.e. columns of the saved matrix.
    """
    logging.info("Loading raw data")
    feats = np.hstack(
        [loader[key]
         for key in ('bfeatures', 'ffeatures', 'ifeatures', 'sfeatures')])

    nrows = feats.shape[0]
    ncols = feats.shape[1]
    # ij[0] holds row indices, ij[1] the hashed column indices.
    ij = np.zeros((2, nrows * ncols), dtype=int)
    for r, record in enumerate(feats):
        if r % 100000 == 0:
            logging.debug(r)
        lo = r * ncols
        hi = lo + ncols
        ij[0, lo:hi] = r
        ij[1, lo:hi] = [
            murmurhash3_32('%d_%s' % (c, value), seed=42, positive=True) % d
            for c, value in enumerate(record)
        ]

    ones = np.ones(ij.shape[1])
    hashed = sparse.csr_matrix((ones, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, hashed)
Beispiel #3
0
def one_hot(loader, output, cutoff=0, encoder=None, all_pairs=None):
    """Build pair features, stack them with the raw groups, and one-hot encode.

    Column order of the stacked matrix is (sfeatures, bfeatures, ffeatures,
    ifeatures, pairs), where the pairs come from calling ``pair_features``
    on ``loader['sfeatures']``.  The encoded result is passed to
    ``save_encoded_features``.

    Args:
        loader: object indexable by 'sfeatures', 'ffeatures', 'ifeatures'
            and 'bfeatures'.
        output: destination handed to ``save_encoded_features``.
        cutoff: forwarded to ``encoder.fit`` when the encoder is built here.
        encoder: existing encoder to apply as-is; ``None`` means create and
            fit a new ``OneHotEncoder``.
        all_pairs: dict handed to ``pair_features``; defaults to a new dict.

    Returns:
        ``(encoder, all_pairs)``.
    """
    if all_pairs is None:
        all_pairs = dict()

    logging.info("Loading raw data")
    base = loader['sfeatures']
    logging.info("Generating pair features")
    crossed = pair_features(base, all_pairs)

    floats_grp = loader['ffeatures']
    ints_grp = loader['ifeatures']
    bools_grp = loader['bfeatures']
    logging.info("Concatenating all features")
    pieces = (base, bools_grp, floats_grp, ints_grp, crossed)
    matrix = np.hstack(pieces)
    # Release every reference to the unstacked inputs.
    del pieces, base, bools_grp, floats_grp, ints_grp, crossed

    need_fit = encoder is None
    if need_fit:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(matrix, cutoff)
    logging.info("One-hot encoding data")
    matrix_hot = encoder.transform(matrix)
    del matrix
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, matrix_hot)
    return encoder, all_pairs
Beispiel #4
0
def one_hot_hash(loader, output, d):
    """Hash single columns and pairwise crosses into ``d`` buckets and save.

    The matrix to hash is stacked as (bfeatures, sfeatures, ifeatures).
    Every cell is hashed on ``'<column>_<value>'``; in addition every pair
    of columns among the leading bfeatures+sfeatures columns is hashed on
    ``'<j1>_<v1>_x_<j2>_<v2>'`` with ``j2 < j1``.  All hashes use
    ``murmurhash3_32`` with seed 42, reduced modulo ``d``.
    ``loader['ffeatures']`` is converted to a CSR matrix and appended,
    unhashed, to the right of the hashed columns before saving.

    Args:
        loader: object indexable by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination handed to ``save_encoded_features``.
        d: number of hash buckets (columns of the hashed part).
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # Floor division: the product is always even, and '//' keeps the count
    # an int even under true division, so it stays usable as a shape/index.
    npairs = npaircols * (npaircols - 1) // 2
    ncols = nsingle + npairs
    # The crossed column pairs are the same for every row; list them once,
    # in the order their hashes are stored.
    pair_index = [(j1, j2) for j1 in range(npaircols) for j2 in range(j1)]

    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
        start = i * ncols
        ij[0, start:start + ncols] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                '%d_%s' % (j, x), seed=42, positive=True) % d
        # Pair hashes follow the single-column hashes within the row's slot.
        pair_start = start + nsingle
        for k, (j1, j2) in enumerate(pair_index):
            ij[1, pair_start + k] = murmurhash3_32(
                '%d_%s_x_%d_%s' % (j1, row[j1], j2, row[j2]),
                seed=42,
                positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)
def one_hot_hash(loader, output, d):
    """Feature-hash (bfeatures, sfeatures, ifeatures) plus pairwise crosses.

    Each cell of the stacked matrix is hashed on ``'<column>_<value>'``.
    Each pair of columns ``j2 < j1`` among the leading bfeatures+sfeatures
    columns is additionally hashed on ``'<j1>_<v1>_x_<j2>_<v2>'``.  Hashes
    come from ``murmurhash3_32`` (seed 42) modulo ``d``.  The CSR form of
    ``loader['ffeatures']`` is appended to the right of the hashed matrix
    and the result is passed to ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination handed to ``save_encoded_features``.
        d: number of hash buckets (columns of the hashed part).
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # '//' instead of '/': the value is used as an array shape and as slice
    # bounds, so it must remain an int regardless of division semantics.
    ncols = nsingle + npaircols * (npaircols - 1) // 2
    # Row-invariant list of crossed column pairs, hoisted out of the loop.
    crosses = [(j1, j2) for j1 in range(npaircols) for j2 in range(j1)]

    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                '%d_%s' % (j, x), seed=42, positive=True) % d
        # Explicit offset rather than reusing the leaked loop variable,
        # which would be undefined for a row with no columns.
        offset = start + nsingle
        for k, (j1, j2) in enumerate(crosses):
            ij[1, offset + k] = murmurhash3_32(
                '%d_%s_x_%d_%s' % (j1, row[j1], j2, row[j2]),
                seed=42, positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)
def raw_encode(loader, output):
    """Stack the four raw feature groups side by side and save them as-is.

    Column order is (bfeatures, ffeatures, ifeatures, sfeatures); no
    encoding is applied before calling ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
    """
    logging.info("Loading data")
    groups = [loader[key]
              for key in ('bfeatures', 'ffeatures', 'ifeatures', 'sfeatures')]
    stacked = np.hstack(groups)
    logging.info("Saving data")
    save_encoded_features(output, stacked)
def one_hot_hash(loader, output, d):
    """Hash all columns and pairwise crosses into ``d`` buckets and save.

    The matrix is stacked as (sfeatures, bfeatures, ifeatures, ffeatures).
    Each cell is hashed on ``str((column, value))``; each pair of columns
    ``j2 < j1`` among the leading sfeatures+bfeatures columns is hashed on
    ``str((j1, v1, j2, v2))``.  Hashes come from ``murmurhash3_32`` with
    seed 42, reduced modulo ``d``.  The sparse result holds 1.0 per hashed
    entry and is passed to ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
        d: number of hash buckets, i.e. columns of the saved matrix.
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # Floor division: the product is always even, and '//' keeps the count
    # an int even under true division, so it stays usable as a shape/index.
    ncols = nsingle + npaircols * (npaircols - 1) // 2
    # The crossed column pairs do not depend on the row; build them once.
    pair_index = [(j1, j2) for j1 in range(npaircols) for j2 in range(j1)]

    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        ij[0, start:start + ncols] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                str((j, x)), seed=42, positive=True) % d
        # Pair hashes are stored right after the single-column hashes.
        pair_start = start + nsingle
        for k, (j1, j2) in enumerate(pair_index):
            ij[1, pair_start + k] = murmurhash3_32(
                str((j1, row[j1], j2, row[j2])),
                seed=42,
                positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
def one_hot(loader, output, cutoff=0, encoder=None, decimals=2):
    """One-hot encode the raw features after rounding ``ffeatures``.

    ``loader['ffeatures']`` is rounded with ``np.around`` to ``decimals``
    places, then the groups are stacked in the order (bfeatures, ffeatures,
    ifeatures, sfeatures), encoded, and written with
    ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
        cutoff: second argument to ``encoder.fit``; only used when a new
            encoder is created here.
        encoder: encoder to reuse without refitting; when ``None`` a new
            ``OneHotEncoder`` is created and fitted on this data.
        decimals: number of decimal places passed to ``np.around``.

    Returns:
        The encoder that was used.
    """
    logging.info("Loading raw data")
    parts = [
        loader['bfeatures'],
        np.around(loader['ffeatures'], decimals),
        loader['ifeatures'],
        loader['sfeatures'],
    ]
    stacked = np.hstack(parts)
    # Only the stacked copy is needed from here on.
    del parts

    if encoder is None:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(stacked, cutoff)

    logging.info("One-hot encoding data")
    encoded = encoder.transform(stacked)

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, encoded)
    return encoder
Beispiel #9
0
def one_hot(loader, output, cutoff=0, encoder=None, decimals=2):
    """Round ``ffeatures``, stack all feature groups, and one-hot encode them.

    Stacking order is (bfeatures, rounded ffeatures, ifeatures, sfeatures).
    The encoded matrix is handed to ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
        cutoff: forwarded to ``encoder.fit`` when the encoder is built here.
        encoder: existing encoder to apply as-is; ``None`` means create and
            fit a new ``OneHotEncoder``.
        decimals: decimal places used by ``np.around`` on 'ffeatures'.

    Returns:
        The encoder that was used.
    """
    logging.info("Loading raw data")
    b_grp = loader['bfeatures']
    f_grp = loader['ffeatures']
    f_grp = np.around(f_grp, decimals)
    i_grp = loader['ifeatures']
    s_grp = loader['sfeatures']
    features = np.hstack((b_grp, f_grp, i_grp, s_grp))
    del b_grp, f_grp, i_grp, s_grp

    build_encoder = encoder is None
    if build_encoder:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(features, cutoff)
    logging.info("One-hot encoding data")
    features_hot = encoder.transform(features)
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, features_hot)
    return encoder
def one_hot(loader, output, cutoff=0, encoder=None):
    """One-hot encode three feature groups and append ``ffeatures`` unencoded.

    (bfeatures, ifeatures, sfeatures) are stacked and one-hot encoded.
    ``loader['ffeatures']`` is converted to a CSR matrix and placed to the
    right of the encoded columns.  The combined sparse matrix is written
    with ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination handed to ``save_encoded_features``.
        cutoff: second argument to ``encoder.fit``; only used when a new
            encoder is created here.
        encoder: encoder to reuse without refitting; when ``None`` a new
            ``OneHotEncoder`` is created and fitted on this data.

    Returns:
        The encoder that was used.
    """
    logging.info("Loading raw data")
    to_encode = np.hstack(
        [loader[key] for key in ('bfeatures', 'ifeatures', 'sfeatures')])
    passthrough = sparse.csr_matrix(loader['ffeatures'])

    if encoder is None:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(to_encode, cutoff)

    logging.info("One-hot encoding data")
    encoded = encoder.transform(to_encode)
    combined = sparse.hstack((encoded, passthrough))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, combined)
    return encoder
Beispiel #11
0
def one_hot(loader, output, cutoff=0, encoder=None):
    """Encode (bfeatures, ifeatures, sfeatures) and join with sparse ffeatures.

    The three groups are stacked and run through the one-hot encoder.  The
    CSR form of ``loader['ffeatures']`` is horizontally stacked after the
    encoded columns, and the result goes to ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination handed to ``save_encoded_features``.
        cutoff: forwarded to ``encoder.fit`` when the encoder is built here.
        encoder: existing encoder to apply as-is; ``None`` means create and
            fit a new ``OneHotEncoder``.

    Returns:
        The encoder that was used.
    """
    logging.info("Loading raw data")
    b_grp, i_grp, s_grp = (
        loader[key] for key in ('bfeatures', 'ifeatures', 'sfeatures'))
    encodable = np.hstack((b_grp, i_grp, s_grp))
    del b_grp, i_grp, s_grp
    f_sparse = sparse.csr_matrix(loader['ffeatures'])

    build_encoder = encoder is None
    if build_encoder:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(encodable, cutoff)
    logging.info("One-hot encoding data")
    blocks = (encoder.transform(encodable), f_sparse)
    joined = sparse.hstack(blocks)
    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, joined)
    return encoder
def one_hot_hash(loader, output, d):
    """Feature-hash every column plus pairwise crosses, then save the result.

    Columns are stacked as (sfeatures, bfeatures, ifeatures, ffeatures).
    Single cells are hashed on ``str((column, value))``.  Pairs of columns
    ``j2 < j1`` drawn from the leading sfeatures+bfeatures columns are
    hashed on ``str((j1, v1, j2, v2))``.  ``murmurhash3_32`` with seed 42
    is used throughout and reduced modulo ``d`` to pick the output column.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
        d: number of hash buckets, i.e. columns of the saved matrix.
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # '//' instead of '/': the value is used as an array shape and as slice
    # bounds, so it must remain an int regardless of division semantics.
    npairs = npaircols * (npaircols - 1) // 2
    ncols = nsingle + npairs
    # Row-invariant list of crossed column pairs, hoisted out of the loop.
    crosses = [(j1, j2) for j1 in range(npaircols) for j2 in range(j1)]

    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        end = (i + 1) * ncols
        ij[0, start:end] = i
        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                str((j, x)), seed=42, positive=True) % d
        # Explicit offset rather than reusing the leaked loop variable,
        # which would be undefined for a row with no columns.
        offset = start + nsingle
        for k, (j1, j2) in enumerate(crosses):
            ij[1, offset + k] = murmurhash3_32(
                str((j1, row[j1], j2, row[j2])),
                seed=42, positive=True) % d
    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
def one_hot_hash(loader, output, d):
    """Map each (column index, value) cell to one of ``d`` hashed columns.

    The groups are stacked as (bfeatures, ffeatures, ifeatures, sfeatures).
    Each cell contributes a 1.0 at the column given by ``murmurhash3_32``
    (seed 42) of ``'<column>_<value>'`` modulo ``d``.  The sparse result is
    passed to ``save_encoded_features``.

    Args:
        loader: object indexable by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination handed to ``save_encoded_features``.
        d: number of hash buckets, i.e. columns of the saved matrix.
    """
    logging.info("Loading raw data")
    b_grp, f_grp, i_grp, s_grp = (
        loader[key]
        for key in ('bfeatures', 'ffeatures', 'ifeatures', 'sfeatures'))
    table = np.hstack((b_grp, f_grp, i_grp, s_grp))
    del b_grp, f_grp, i_grp, s_grp

    nrows = table.shape[0]
    ncols = table.shape[1]
    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    # Row r owns the ncols consecutive slots starting at r * ncols.
    ij[0] = np.repeat(np.arange(nrows), ncols)
    slot = 0
    for r, record in enumerate(table):
        if r % 100000 == 0:
            logging.debug(r)
        for c, value in enumerate(record):
            key = '%d_%s' % (c, value)
            ij[1, slot] = murmurhash3_32(key, seed=42, positive=True) % d
            slot += 1
    weights = np.ones(ij.shape[1])  # all ones
    hashed = sparse.csr_matrix((weights, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, hashed)