def one_hot(loader, output, cutoff=0, encoder=None, all_pairs=None):
    """One-hot encode the raw feature blocks together with pair features.

    Pair features are produced by ``pair_features`` from the 'sfeatures'
    block, stacked column-wise with the other blocks, one-hot encoded and
    handed to ``save_encoded_features``.

    Args:
        loader: object indexed by 'sfeatures', 'ffeatures', 'ifeatures'
            and 'bfeatures' (presumably 2-D arrays with equal row counts;
            confirm against the caller).
        output: destination passed through to ``save_encoded_features``.
        cutoff: forwarded to ``encoder.fit`` when a new encoder is built.
        encoder: an existing encoder to reuse; when ``None`` a new
            ``OneHotEncoder`` is created and fitted on this data.
        all_pairs: dict handed to ``pair_features``; a fresh one is
            created when ``None``.

    Returns:
        Tuple ``(encoder, all_pairs)``.
    """
    logging.info("Loading raw data")
    pair_source = loader['sfeatures']

    logging.info("Generating pair features")
    if all_pairs is None:
        all_pairs = {}
    pair_block = pair_features(pair_source, all_pairs)

    float_block = loader['ffeatures']
    int_block = loader['ifeatures']
    bool_block = loader['bfeatures']

    logging.info("Concatenating all features")
    stacked = np.hstack(
        (pair_source, bool_block, float_block, int_block, pair_block))
    # Drop the individual blocks so only the stacked copy stays alive.
    del pair_source, bool_block, float_block, int_block, pair_block

    if encoder is None:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(stacked, cutoff)

    logging.info("One-hot encoding data")
    encoded = encoder.transform(stacked)
    del stacked

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, encoded)
    return encoder, all_pairs
def one_hot_hash(loader, output, d):
    """Hash every (column, value) cell of the raw features into ``d`` buckets.

    The four feature blocks are stacked column-wise; each cell is hashed
    with ``murmurhash3_32`` (seed 42) on the key ``'<column>_<value>'`` and
    reduced modulo ``d`` to pick a column of the sparse output, which is
    then passed to ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination passed through to ``save_encoded_features``.
        d: number of hash buckets, i.e. the width of the output matrix.
    """
    logging.info("Loading raw data")
    blocks = [loader[key] for key in
              ('bfeatures', 'ffeatures', 'ifeatures', 'sfeatures')]
    X = np.hstack(blocks)
    del blocks

    n_samples = X.shape[0]
    n_features = X.shape[1]

    def bucket(column, value):
        # Column index is part of the key so equal values in different
        # columns are hashed separately.
        key = '%d_%s' % (column, value)
        return murmurhash3_32(key, seed=42, positive=True) % d

    # Row 0 holds output row indices, row 1 holds hashed column indices.
    coords = np.zeros((2, n_samples * n_features), dtype=int)
    for sample_no, sample in enumerate(X):
        if sample_no % 100000 == 0:
            logging.debug(sample_no)
        offset = sample_no * n_features
        coords[0, offset:offset + n_features] = sample_no
        for column, value in enumerate(sample):
            coords[1, offset + column] = bucket(column, value)

    ones = np.ones(coords.shape[1])
    X_hot = sparse.csr_matrix((ones, coords), shape=(n_samples, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
def one_hot_hash(loader, output, d):
    """Hash single features and pairwise crosses into ``d`` buckets.

    'bfeatures', 'sfeatures' and 'ifeatures' are stacked column-wise. Each
    cell is hashed on the key ``'<column>_<value>'``; in addition every
    unordered pair of the first ``npaircols`` columns (the 'bfeatures' and
    'sfeatures' columns) is hashed on
    ``'<j1>_<value1>_x_<j2>_<value2>'``. Hashes use ``murmurhash3_32``
    with seed 42 and are reduced modulo ``d``. The 'ffeatures' block is
    appended unhashed as extra sparse columns and the result is passed to
    ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination passed through to ``save_encoded_features``.
        d: number of hash buckets (width of the hashed part of the output).
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # Floor division keeps the pair count an int even when `/` means true
    # division (Python 3 or `from __future__ import division`); a float
    # here would break the array shape and the slice bounds below.
    npairs = npaircols * (npaircols - 1) // 2
    ncols = nsingle + npairs

    # Row 0 holds output row indices, row 1 holds hashed column indices.
    ij = np.zeros((2, nrows * ncols), dtype=int)
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
        start = i * ncols
        ij[0, start:start + ncols] = i

        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                '%d_%s' % (j, x), seed=42, positive=True) % d

        # Explicit write cursor for the pair entries, placed right after
        # the single-feature entries; this avoids depending on the loop
        # variable above, which is unbound when a row has no columns.
        pos = start + nsingle
        # `range` works on both Python 2 and 3; npaircols is a column
        # count, so the list built on Python 2 is small.
        for j1 in range(npaircols):
            for j2 in range(j1):
                ij[1, pos] = murmurhash3_32(
                    '%d_%s_x_%d_%s' % (j1, row[j1], j2, row[j2]),
                    seed=42, positive=True) % d
                pos += 1

    data = np.ones(ij.shape[1])  # every stored entry is 1
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)
def one_hot_hash(loader, output, d):
    """Feature-hash single columns and column pairs into ``d`` buckets.

    The 'bfeatures', 'sfeatures' and 'ifeatures' blocks are joined
    column-wise. Every cell contributes one entry hashed on
    ``'<column>_<value>'``; every pair ``(j1, j2)`` with ``j2 < j1`` among
    the first ``npaircols`` columns (those from 'bfeatures' and
    'sfeatures') contributes one entry hashed on
    ``'<j1>_<value1>_x_<j2>_<value2>'``. ``murmurhash3_32`` with seed 42 is
    used and the hash is taken modulo ``d``. 'ffeatures' is converted to a
    sparse matrix and appended to the right of the hashed matrix before
    calling ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination passed through to ``save_encoded_features``.
        d: number of hash buckets (width of the hashed part of the output).
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = bfs.shape[1] + sfs.shape[1]
    X = np.hstack((bfs, sfs, ifs))
    del bfs, ifs, sfs
    X2 = sparse.csr_matrix(loader['ffeatures'])

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # `//` instead of `/`: the entry count must stay an int so it can be
    # used as an array dimension and slice bound under true division too.
    npairs = npaircols * (npaircols - 1) // 2
    ncols = nsingle + npairs

    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
        start = i * ncols
        end = start + ncols
        ij[0, start:end] = i

        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                '%d_%s' % (j, x), seed=42, positive=True) % d

        # Pair entries follow the single-feature entries of this row. The
        # cursor is computed directly rather than from the leaked loop
        # index, which does not exist when the row is empty.
        cursor = start + nsingle
        # `range` is available on Python 2 and 3 alike.
        for j1 in range(npaircols):
            for j2 in range(j1):
                ij[1, cursor] = murmurhash3_32(
                    '%d_%s_x_%d_%s' % (j1, row[j1], j2, row[j2]),
                    seed=42, positive=True) % d
                cursor += 1

    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))
    X = sparse.hstack((X_hot, X2))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X)
def raw_encode(loader, output):
    """Stack the raw feature blocks column-wise and save them unencoded.

    Args:
        loader: object indexed by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination passed through to ``save_encoded_features``.
    """
    logging.info("Loading data")
    # Order matters: it fixes the column layout of the saved matrix.
    block_names = ('bfeatures', 'ffeatures', 'ifeatures', 'sfeatures')
    blocks = [loader[name] for name in block_names]
    stacked = np.hstack(blocks)

    logging.info("Saving data")
    save_encoded_features(output, stacked)
def one_hot_hash(loader, output, d):
    """Hash single features and pairwise crosses into ``d`` buckets.

    'sfeatures', 'bfeatures', 'ifeatures' and 'ffeatures' are stacked
    column-wise. Each cell is hashed on ``str((column, value))``; every
    pair ``(j1, j2)`` with ``j2 < j1`` among the first ``npaircols``
    columns (the 'sfeatures' and 'bfeatures' columns) is hashed on
    ``str((j1, value1, j2, value2))``. Hashes use ``murmurhash3_32`` with
    seed 42 and are reduced modulo ``d``. The sparse result is passed to
    ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination passed through to ``save_encoded_features``.
        d: number of hash buckets, i.e. the width of the output matrix.
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # Floor division keeps the pair count an int even when `/` means true
    # division (Python 3 or `from __future__ import division`); a float
    # here would break the array shape and the slice bounds below.
    npairs = npaircols * (npaircols - 1) // 2
    ncols = nsingle + npairs

    # Row 0 holds output row indices, row 1 holds hashed column indices.
    ij = np.zeros((2, nrows * ncols), dtype=int)
    # Alternatives previously tried here: mmh3.hash, pyhash.murmur3_32 and
    # the builtin hash().
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        ij[0, start:start + ncols] = i

        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                str((j, x)), seed=42, positive=True) % d

        # Explicit write cursor for the pair entries, placed right after
        # the single-feature entries; this avoids depending on the loop
        # variable above, which is unbound when a row has no columns.
        pos = start + nsingle
        # `range` works on both Python 2 and 3; npaircols is a column
        # count, so the list built on Python 2 is small.
        for j1 in range(npaircols):
            for j2 in range(j1):
                ij[1, pos] = murmurhash3_32(
                    str((j1, row[j1], j2, row[j2])),
                    seed=42, positive=True) % d
                pos += 1

    data = np.ones(ij.shape[1])  # every stored entry is 1
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
def one_hot(loader, output, cutoff=0, encoder=None, decimals=2):
    """One-hot encode the raw feature blocks, rounding 'ffeatures' first.

    Args:
        loader: object indexed by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination passed through to ``save_encoded_features``.
        cutoff: forwarded to ``encoder.fit`` when a new encoder is built.
        encoder: an existing encoder to reuse; when ``None`` a new
            ``OneHotEncoder`` is created and fitted on this data.
        decimals: number of decimals 'ffeatures' is rounded to with
            ``np.around`` before encoding.

    Returns:
        The encoder that was used (new or supplied).
    """
    logging.info("Loading raw data")
    bool_block = loader['bfeatures']
    rounded_floats = np.around(loader['ffeatures'], decimals)
    int_block = loader['ifeatures']
    str_block = loader['sfeatures']

    stacked = np.hstack((bool_block, rounded_floats, int_block, str_block))
    # Only the stacked copy is needed from here on.
    del bool_block, rounded_floats, int_block, str_block

    if encoder is None:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(stacked, cutoff)

    logging.info("One-hot encoding data")
    encoded = encoder.transform(stacked)

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, encoded)
    return encoder
def one_hot(loader, output, cutoff=0, encoder=None):
    """One-hot encode the non-float blocks and append 'ffeatures' as-is.

    'bfeatures', 'ifeatures' and 'sfeatures' are stacked and one-hot
    encoded; 'ffeatures' is converted to a sparse matrix and placed to the
    right of the encoded columns. The combined matrix is passed to
    ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ifeatures', 'sfeatures'
            and 'ffeatures'.
        output: destination passed through to ``save_encoded_features``.
        cutoff: forwarded to ``encoder.fit`` when a new encoder is built.
        encoder: an existing encoder to reuse; when ``None`` a new
            ``OneHotEncoder`` is created and fitted on this data.

    Returns:
        The encoder that was used (new or supplied).
    """
    logging.info("Loading raw data")
    categorical_blocks = [loader[name] for name in
                          ('bfeatures', 'ifeatures', 'sfeatures')]
    categorical = np.hstack(categorical_blocks)
    del categorical_blocks
    float_part = sparse.csr_matrix(loader['ffeatures'])

    if encoder is None:
        logging.info("Making new one-hot encoder")
        encoder = OneHotEncoder()
        encoder.fit(categorical, cutoff)

    logging.info("One-hot encoding data")
    encoded = encoder.transform(categorical)
    combined = sparse.hstack((encoded, float_part))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, combined)
    return encoder
def one_hot_hash(loader, output, d):
    """Feature-hash single columns and column pairs into ``d`` buckets.

    The 'sfeatures', 'bfeatures', 'ifeatures' and 'ffeatures' blocks are
    joined column-wise. Every cell contributes one entry hashed on
    ``str((column, value))``; every pair ``(j1, j2)`` with ``j2 < j1``
    among the first ``npaircols`` columns (those from 'sfeatures' and
    'bfeatures') contributes one entry hashed on
    ``str((j1, value1, j2, value2))``. ``murmurhash3_32`` with seed 42 is
    used and the hash is taken modulo ``d``. The sparse result is passed
    to ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination passed through to ``save_encoded_features``.
        d: number of hash buckets, i.e. the width of the output matrix.
    """
    logging.info("Loading raw data")
    bfs = loader['bfeatures']
    ffs = loader['ffeatures']
    ifs = loader['ifeatures']
    sfs = loader['sfeatures']
    npaircols = sfs.shape[1] + bfs.shape[1]
    X = np.hstack((sfs, bfs, ifs, ffs))
    del bfs, ffs, ifs, sfs

    nrows = X.shape[0]
    nsingle = X.shape[1]
    # `//` instead of `/`: the entry count must stay an int so it can be
    # used as an array dimension and slice bound under true division too.
    npairs = npaircols * (npaircols - 1) // 2
    ncols = nsingle + npairs

    ij = np.zeros((2, nrows * ncols), dtype=int)  # row, col indices
    # Alternatives previously tried here: mmh3.hash, pyhash.murmur3_32 and
    # the builtin hash().
    for i, row in enumerate(X):
        if i % 50000 == 0:
            logging.debug(i)
            gc.collect()
        start = i * ncols
        end = start + ncols
        ij[0, start:end] = i

        for j, x in enumerate(row):
            ij[1, start + j] = murmurhash3_32(
                str((j, x)), seed=42, positive=True) % d

        # Pair entries follow the single-feature entries of this row. The
        # cursor is computed directly rather than from the leaked loop
        # index, which does not exist when the row is empty.
        cursor = start + nsingle
        # `range` is available on Python 2 and 3 alike.
        for j1 in range(npaircols):
            for j2 in range(j1):
                ij[1, cursor] = murmurhash3_32(
                    str((j1, row[j1], j2, row[j2])),
                    seed=42, positive=True) % d
                cursor += 1

    data = np.ones(ij.shape[1])  # all ones
    X_hot = sparse.csr_matrix((data, ij), shape=(nrows, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)
def one_hot_hash(loader, output, d):
    """Encode the raw features with the hashing trick into ``d`` columns.

    All four feature blocks are stacked column-wise. For each cell the key
    ``'<column>_<value>'`` is hashed with ``murmurhash3_32`` (seed 42) and
    taken modulo ``d`` to select the output column; the resulting sparse
    matrix is passed to ``save_encoded_features``.

    Args:
        loader: object indexed by 'bfeatures', 'ffeatures', 'ifeatures'
            and 'sfeatures'.
        output: destination passed through to ``save_encoded_features``.
        d: number of hash buckets, i.e. the width of the output matrix.
    """
    logging.info("Loading raw data")
    bool_block = loader['bfeatures']
    float_block = loader['ffeatures']
    int_block = loader['ifeatures']
    str_block = loader['sfeatures']
    X = np.hstack((bool_block, float_block, int_block, str_block))
    del bool_block, float_block, int_block, str_block

    height = X.shape[0]
    width = X.shape[1]

    # First row: output row index of each entry; second row: hashed column.
    ij = np.zeros((2, height * width), dtype=int)
    for row_no, values in enumerate(X):
        if row_no % 100000 == 0:
            logging.debug(row_no)
        lo = row_no * width
        hi = lo + width
        ij[0, lo:hi] = row_no
        # One hashed column index per cell of this row, in column order.
        ij[1, lo:hi] = [
            murmurhash3_32('%d_%s' % (col_no, value),
                           seed=42, positive=True) % d
            for col_no, value in enumerate(values)
        ]

    all_ones = np.ones(ij.shape[1])
    X_hot = sparse.csr_matrix((all_ones, ij), shape=(height, d))

    logging.info("Saving one-hotted data to output")
    save_encoded_features(output, X_hot)