コード例 #1
0
def preprocess_and_dump_hd5(in_csv, X_file, scaler, chunksize=100000):
    """Stream a large CSV through *scaler* and dump the features to HDF5.

    Reads ``in_csv`` in chunks of ``chunksize`` data rows, applies
    ``add_features`` and ``scaler.transform`` to each chunk, and writes the
    scaled rows into dataset ``"X"`` of ``X_file``.

    Parameters
    ----------
    in_csv : str
        Path to the input CSV (first line is the header).
    X_file : str
        Path of the HDF5 file to create (overwritten if it exists).
    scaler : object
        Fitted transformer exposing ``transform`` (e.g. an sklearn scaler).
    chunksize : int
        Number of CSV rows processed per iteration.
    """
    # Count lines in pure Python instead of "wc -l" through a shell:
    # shell=True with an interpolated filename is a shell-injection risk
    # and 'wc' is not available on all platforms.
    with open(in_csv, 'rb') as fh:
        nlines = sum(1 for _ in fh)

    # Read only the header row to recover the column names, then derive the
    # feature list (columns produced by add_features minus non-features).
    df = pd.read_csv(in_csv, nrows=1)
    header_row = df.columns
    filter_out = [
        'id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal'
    ]
    df = add_features(df)
    features = [f for f in df.columns if f not in filter_out]

    with h5py.File(X_file, "w") as f:
        # One row per data line (header excluded), one column per feature.
        X = f.create_dataset("X", (nlines - 1, len(features)), dtype='float64')
        # Iteratively read the CSV so the whole file never sits in memory.
        for k, i in enumerate(range(1, nlines, chunksize)):
            print("iteration {}, line {}".format(k, i))
            df = pd.read_csv(
                in_csv,
                header=None,       # header already consumed above
                nrows=chunksize,   # rows to read this iteration
                names=header_row,  # reattach the column names
                skiprows=i)        # skip header + rows already processed
            df = add_features(df)
            data = scaler.transform(df[features])
            # h5py clips the slice at the dataset end, so the (possibly
            # shorter) final chunk still fits.
            X[i - 1:i - 1 + chunksize, :] = data
def preprocess_and_dump_hd5(in_csv, pf_csv, X_file, scaler, chunksize=100000):
    """Stream a data CSV plus a prediction CSV into one HDF5 feature matrix.

    Same chunked pipeline as the single-file variant, but the
    ``prediction`` column read from ``pf_csv`` is appended as the last
    feature column (``pf_csv`` is assumed row-aligned with ``in_csv`` --
    TODO confirm; no join on 'id' is performed).

    Parameters
    ----------
    in_csv : str
        Path to the input CSV (first line is the header).
    pf_csv : str
        Path to a CSV with columns (id, prediction), aligned with ``in_csv``.
    X_file : str
        Path of the HDF5 file to create (overwritten if it exists).
    scaler : object
        Fitted transformer exposing ``transform``.
    chunksize : int
        Number of CSV rows processed per iteration.
    """
    # Count lines in pure Python instead of "wc -l" through a shell:
    # shell=True with an interpolated filename is a shell-injection risk
    # and 'wc' is not available on all platforms.
    with open(in_csv, 'rb') as fh:
        nlines = sum(1 for _ in fh)

    # Header row -> column names; derive the feature list once.
    df = pd.read_csv(in_csv, nrows=1)
    header_row = df.columns
    filter_out = [
        'id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal'
    ]
    df = add_features(df)
    features = [f for f in df.columns if f not in filter_out]

    with h5py.File(X_file, "w") as f:
        # +1 column for the appended prediction.
        X = f.create_dataset(
            "X", (nlines - 1, len(features) + 1), dtype='float64')
        # Read both CSVs in lock-step chunks.
        for k, i in enumerate(range(1, nlines, chunksize)):
            print("iteration {}, line {}".format(k, i))
            df = pd.read_csv(
                in_csv,
                header=None,       # header already consumed above
                nrows=chunksize,   # rows to read this iteration
                names=header_row,  # reattach the column names
                skiprows=i)        # skip header + rows already processed
            pf = pd.read_csv(
                pf_csv,
                header=None,
                nrows=chunksize,
                names=["id", "prediction"],
                skiprows=i)
            df = add_features(df)
            data = scaler.transform(df[features])
            # Append the chunk's predictions as the final column.
            data = np.hstack((data, pf['prediction'].values.reshape(-1, 1)))
            # h5py clips the slice at the dataset end, so the (possibly
            # shorter) final chunk still fits.
            X[i - 1:i - 1 + chunksize, :] = data
コード例 #3
0
def load_data(train_file, shuffle=False, seed=None):
    """Read the training CSV, add derived features and return X, y, names.

    Returns a tuple ``(X, y, features)`` where ``X`` is the feature matrix,
    ``y`` the 'signal' target column and ``features`` the feature names.
    """
    print("Load the training data")
    frame = pd.read_csv(train_file)
    frame = add_features(frame)

    if shuffle:
        if seed is not None:
            # Seed so the row permutation is reproducible.
            np.random.seed(seed)
        order = np.random.permutation(len(frame))
        frame = frame.iloc[order].reset_index(drop=True)

    # Everything except bookkeeping/target columns counts as a feature.
    excluded = ['id', 'min_ANNmuon', 'production', 'mass', 'signal']
    features = [col for col in frame.columns if col not in excluded]
    return frame[features].values, frame['signal'].values, features
コード例 #4
0
def load(data_file, prediction_file, tail=None, weight=False, mass=False):
    """Load a data CSV plus per-row predictions and return ``(X, y, w, m)``.

    Parameters
    ----------
    data_file : str
        CSV with the raw columns; passed through ``add_features``.
    prediction_file : str
        CSV with a 'prediction' column, assumed row-aligned with
        ``data_file`` -- TODO confirm (no join on 'id' is performed).
    tail : int or None
        If given, keep only the last ``tail`` rows before shuffling.
    weight : bool
        If True, also return the 'weight' column as ``w``.
    mass : bool
        If True, return the 'mass' column as ``m`` and set ``y`` to None.

    Returns
    -------
    (X, y, w, m) : X feature matrix; y signal labels or None; w weights or
        None; m masses or None.
    """
    data = pd.read_csv(data_file)
    data = add_features(data)
    prediction = pd.read_csv(prediction_file)
    data['prediction'] = prediction["prediction"]
    if tail is not None:
        # DataFrame.tail handles tail == 0 correctly (empty frame), unlike
        # data[-tail:], which returns EVERY row because -0 == 0.
        data = data.tail(tail)

    # Shuffle rows (unseeded, so not reproducible across calls).
    data = data.iloc[np.random.permutation(len(data))].reset_index(drop=True)

    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal']
    features = [f for f in data.columns if f not in filter_out]
    X = data[features].values
    y = data['signal'].values if not mass else None
    w = data['weight'].values if weight else None
    m = data['mass'].values if mass else None
    return X, y, w, m
コード例 #5
0
def load(data_file, prediction_file, tail=None, weight=False, mass=False):
    """Assemble the feature matrix plus optional labels, weights and masses.

    Reads ``data_file``, derives extra features, attaches the 'prediction'
    column from ``prediction_file``, optionally keeps only the last ``tail``
    rows, shuffles, and returns ``(X, y, w, m)``.
    """
    frame = pd.read_csv(data_file)
    frame = add_features(frame)
    preds = pd.read_csv(prediction_file)
    frame['prediction'] = preds["prediction"]
    if tail is not None:
        frame = frame[-tail:]

    # shuffle
    perm = np.random.permutation(len(frame))
    frame = frame.iloc[perm].reset_index(drop=True)

    dropped = [
        'id', 'min_ANNmuon', 'production', 'mass', 'weight', 'signal'
    ]
    feature_cols = [c for c in frame.columns if c not in dropped]
    X = frame[feature_cols].values
    y = None if mass else frame['signal'].values
    w = frame['weight'].values if weight else None
    m = frame['mass'].values if mass else None
    return X, y, w, m