def preprocess_and_dump_hd5(in_csv, X_file, scaler, chunksize=100000):
    """Stream a CSV through the scaler and dump the feature matrix to HDF5.

    Parameters
    ----------
    in_csv : str
        Path to the input CSV; the first line is the header.
    X_file : str
        Path of the HDF5 file to create; data goes into dataset "X".
    scaler : object
        Fitted scaler exposing ``transform`` (e.g. a sklearn scaler).
    chunksize : int, optional
        Number of CSV rows read per iteration (default 100000).
    """
    # Count lines without invoking a shell: passing the filename as a
    # separate argv element avoids shell injection and handles paths
    # containing spaces (the old `shell=True` string form did neither).
    nlines = subprocess.check_output(['wc', '-l', in_csv])
    nlines = int(nlines.split()[0])
    # Read just the header row to capture the column names.
    df = pd.read_csv(in_csv, nrows=1)
    header_row = df.columns
    # Bookkeeping columns excluded from the feature matrix.
    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal']
    # add_features adds derived columns; its output (minus filter_out)
    # defines the feature set and hence the dataset width.
    df = add_features(df)
    features = [f for f in df.columns if f not in filter_out]
    with h5py.File(X_file, "w") as f:
        # nlines includes the header line, hence nlines - 1 data rows.
        X = f.create_dataset("X", (nlines - 1, len(features)), dtype='float64')
        # Iteratively read the CSV so the whole file never sits in memory.
        for k, i in enumerate(range(1, nlines, chunksize)):
            print("iteration {}, line {}".format(k, i))
            df = pd.read_csv(in_csv,
                             header=None,       # no header in this slice
                             nrows=chunksize,   # rows to read this iteration
                             names=header_row,  # reuse the captured header
                             skiprows=i)        # skip rows already consumed
            df = add_features(df)
            data = scaler.transform(df[features])
            # The final chunk may be shorter than chunksize; the slice end is
            # clipped to the dataset bound, so lengths still match.
            X[i - 1:i - 1 + chunksize, :] = data
def preprocess_and_dump_hd5(in_csv, pf_csv, X_file, scaler, chunksize=100000):
    """Stream a CSV plus a prediction CSV to HDF5 as one feature matrix.

    Like the in_csv-only variant, but appends the prediction column from
    ``pf_csv`` as the last column of the "X" dataset.

    Parameters
    ----------
    in_csv : str
        Path to the input CSV; the first line is the header.
    pf_csv : str
        Path to a CSV of predictions, row-aligned with ``in_csv``
        (assumed to have a header line too, since row i is skipped in
        lockstep — TODO confirm against the producer of this file).
    X_file : str
        Path of the HDF5 file to create; data goes into dataset "X".
    scaler : object
        Fitted scaler exposing ``transform`` (e.g. a sklearn scaler).
    chunksize : int, optional
        Number of CSV rows read per iteration (default 100000).
    """
    # Count lines without invoking a shell: argv-list form avoids shell
    # injection and handles paths with spaces (unlike `shell=True`).
    nlines = subprocess.check_output(['wc', '-l', in_csv])
    nlines = int(nlines.split()[0])
    # Read just the header row to capture the column names.
    df = pd.read_csv(in_csv, nrows=1)
    header_row = df.columns
    # Bookkeeping columns excluded from the feature matrix.
    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal']
    df = add_features(df)
    features = [f for f in df.columns if f not in filter_out]
    with h5py.File(X_file, "w") as f:
        # +1 column for the appended prediction; nlines - 1 data rows
        # because nlines includes the header line.
        X = f.create_dataset("X", (nlines - 1, len(features) + 1),
                             dtype='float64')
        # Iteratively read both CSVs in lockstep chunks.
        for k, i in enumerate(range(1, nlines, chunksize)):
            print("iteration {}, line {}".format(k, i))
            df = pd.read_csv(in_csv,
                             header=None,       # no header in this slice
                             nrows=chunksize,   # rows to read this iteration
                             names=header_row,  # reuse the captured header
                             skiprows=i)        # skip rows already consumed
            pf = pd.read_csv(pf_csv,
                             header=None,
                             nrows=chunksize,
                             names=["id", "prediction"],
                             skiprows=i)
            df = add_features(df)
            data = scaler.transform(df[features])
            # Append predictions as the final column of this chunk.
            data = np.hstack((data, pf['prediction'].values.reshape(-1, 1)))
            # Final chunk may be shorter; slice end clips to the bound.
            X[i - 1:i - 1 + chunksize, :] = data
def load_data(train_file, shuffle=False, seed=None):
    """Read the training CSV and return (X, y, features).

    Parameters
    ----------
    train_file : str
        Path to the training CSV.
    shuffle : bool, optional
        When True, rows are randomly permuted before extraction.
    seed : int or None, optional
        Seed applied to numpy's RNG before shuffling (only if shuffling).

    Returns
    -------
    tuple
        ``(X, y, features)`` — feature matrix, 'signal' labels, and the
        list of feature column names.
    """
    print("Load the training data")
    frame = add_features(pd.read_csv(train_file))
    if shuffle:
        # Seed first so the permutation below is reproducible.
        if seed is not None:
            np.random.seed(seed)
        order = np.random.permutation(len(frame))
        frame = frame.iloc[order].reset_index(drop=True)
    # Bookkeeping columns that are not model features.
    excluded = {'id', 'min_ANNmuon', 'production', 'mass', 'signal'}
    features = [col for col in frame.columns if col not in excluded]
    return frame[features].values, frame['signal'].values, features
def load(data_file, prediction_file, tail=None, weight=False, mass=False):
    """Load a data CSV plus its prediction CSV and return shuffled arrays.

    Parameters
    ----------
    data_file : str
        Path to the data CSV.
    prediction_file : str
        Path to a CSV with a "prediction" column, row-aligned with
        ``data_file`` (pandas aligns the assignment on index).
    tail : int or None, optional
        When given, keep only the last ``tail`` rows before shuffling.
    weight : bool, optional
        When True, also return the 'weight' column.
    mass : bool, optional
        When True, return the 'mass' column and suppress the labels.

    Returns
    -------
    tuple
        ``(X, y, w, m)`` where unused entries are ``None``.
    """
    data = add_features(pd.read_csv(data_file))
    # Attach the model's prediction column to the data frame.
    data['prediction'] = pd.read_csv(prediction_file)["prediction"]
    if tail is not None:
        # Restrict to the last `tail` rows.
        data = data[-tail:]
    # Shuffle the rows and renumber the index.
    order = np.random.permutation(len(data))
    data = data.iloc[order].reset_index(drop=True)
    # Bookkeeping columns that are not model features.
    dropped = {'id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal'}
    features = [col for col in data.columns if col not in dropped]
    X = data[features].values
    y = None if mass else data['signal'].values
    w = data['weight'].values if weight else None
    m = data['mass'].values if mass else None
    return X, y, w, m
def load(data_file, prediction_file, tail=None, weight=False, mass=False):
    """Load a data CSV plus its prediction CSV and return (X, y, w, m).

    NOTE(review): this is an exact duplicate of the `load` defined
    earlier in this file; being the later definition, it shadows the
    earlier one at import time. Confirm which copy is intended and
    remove the other.

    Parameters
    ----------
    data_file : str
        Path to the data CSV.
    prediction_file : str
        Path to a CSV with a "prediction" column.
    tail : int or None, optional
        When given, keep only the last ``tail`` rows before shuffling.
    weight : bool, optional
        When True, also return the 'weight' column.
    mass : bool, optional
        When True, return the 'mass' column and suppress the labels.

    Returns
    -------
    tuple
        ``(X, y, w, m)`` where unused entries are ``None``.
    """
    data = pd.read_csv(data_file)
    # add_features presumably derives extra columns — defined elsewhere.
    data = add_features(data)
    prediction = pd.read_csv(prediction_file)
    # Attach the prediction column; pandas aligns the assignment on index.
    data['prediction'] = prediction["prediction"]
    if tail is not None:
        # Keep only the final `tail` rows.
        data = data[-tail:]
    # shuffle
    data = data.iloc[np.random.permutation(len(data))].reset_index(drop=True)
    # Bookkeeping columns excluded from the feature matrix.
    filter_out = [
        'id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal'
    ]
    features = list(f for f in data.columns if f not in filter_out)
    X = data[features].values
    # When mass is requested, labels are intentionally omitted (y is None).
    y = data['signal'].values if not mass else None
    w = data['weight'].values if weight else None
    m = data['mass'].values if mass else None
    return X, y, w, m