Example #1
def test_scoring(weights, multiprocessing=False):
    print('Weights: ')
    print('Barrel\t' + str(weights[0]))
    print('Caps  \t' + str(weights[1]))
    print('-'*50)
    #weights = np.insert(weights, 0, 1)
    event_prefix = "event00000%d"
    if multiprocessing:
        jobs = []
        pool = mp.Pool(processes=4)
    else:
        test_dataset_submissions = []
        scores = []
    for event in [1000 + i for i in range(1)]:
        event_id = event_prefix % event
        hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_id))
        if multiprocessing:
            jobs.append(pool.apply_async(add_submission, args=(event,hits,weights,True,truth,True)))
        else:
            results = add_submission(event,hits,weights,True,truth,False)
            test_dataset_submissions.append(results[0])
            scores.append(results[1])
    
    if multiprocessing:
        test_dataset_submissions = [job.get()[0] for job in jobs]
        scores = [job.get()[1] for job in jobs]

    print('Avg of events: ')
    avg_score = sum(scores)/float(len(scores))
    print(avg_score)
    return avg_score
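
A minimal usage sketch, reusing the five-element weight vector that appears in the later examples; path_to_train, mp, load_event and add_submission are assumed to be defined at module level as in the surrounding code.

weights = [0.1, 0.01, 0.1, 1, 1]
avg = test_scoring(weights, multiprocessing=True)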
def test_events():
    ''' Process a few events to demonstrate the method
    before using it on the entire testing dataset.
    '''
    events = range(1000, 1012)
    path_to_train = '../../data/raw/train_100_events/'
    parts_to_load = ['hits', 'truth']

    model = Clusterer(350, 2, 40, 40)
    for event_id in events:

        with timer('Processing %s' % event_id):
            hits, truth = load_event(path_to_train + 'event00000' +
                                     str(event_id),
                                     parts=parts_to_load)

            # Add the needed information.
            r = np.sqrt(hits['x'].values**2 + hits['y'].values**2)
            d = np.sqrt(hits['x'].values**2 + hits['y'].values**2 +
                        hits['z'].values**2)
            hits['phi'] = np.arctan2(hits['y'].values, hits['x'].values)
            hits['sphi'] = np.sin(hits['phi'].values)
            hits['cphi'] = np.cos(hits['phi'].values)
            hits['zd'] = hits['z'].values / d
            hits['zr'] = hits['z'].values / r

            scaler = MinMaxScaler()
            X = scaler.fit_transform(hits[['zr', 'zd', 'sphi', 'cphi']])

            labels = model.predict(X)
            submission = create_one_event_submission(event_id, hits, labels)
            score = score_event_fast(truth, submission)
            print('score = %.4f' % score)
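
create_one_event_submission is not shown in this snippet; a plausible sketch of the DataFrame-based variant used here, producing the (event_id, hit_id, track_id) layout of a TrackML submission.

import numpy as np
import pandas as pd

def create_one_event_submission(event_id, hits, labels):
    # one row per hit: (event_id, hit_id, predicted track label)
    sub_data = np.column_stack(([event_id] * len(hits), hits.hit_id.values, labels))
    return pd.DataFrame(data=sub_data,
                        columns=['event_id', 'hit_id', 'track_id']).astype(int)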
def test(model, test, shift=10000):
    """create prediction from train model over the test data. then append the prediction to the test data tp create the submission
    model   -- trained model
    test    -- test data
    shift   -- coordinate shift needed to eliminate negitive numbers
    """
    df_test = []

    for e in test:
        hits, cells = load_event(e, parts=['hits', 'cells'])
        hits['event_id'] = int(e[-9:])
        cells = cells.groupby(by=['hit_id'])[['ch0', 'ch1', 'value']].agg(
            ['mean']).reset_index()
        cells.columns = ['hit_id', 'ch0', 'ch1', 'value']
        hits = pd.merge(hits, cells, how='left', on='hit_id')
        # Pulls together all the needed per-hit data on an event-by-event basis
        print(e, "Test")
        # adding shift (10000 by default) moves the event space into non-negative coordinates; the model won't take negatives
        hits['x'] += shift
        hits['y'] += shift
        hits['z'] += shift
        # Predicts the test set
        hits['particle_id'] = model.predict(hits[[
            'hit_id', 'x', 'y', 'z', 'volume_id', 'layer_id', 'module_id',
            'event_id', 'ch0', 'ch1', 'value'
        ]].values,
                                            verbose=1)
        # append the predictions onto the test data frame
        df_test.append(hits[['event_id', 'hit_id', 'particle_id']].copy())

    return df_test
Example #4
def process_single_event(event_number):
    start = time.time()
    file_name = 'event00000' + str(event_number)
    event_id = file_name
    hits_orig, cells, particles, truth = load_event('data/train_sample/' +
                                                    event_id)
    merge_by_hit_ids = hits_orig.merge(truth, how='inner', on='hit_id')
    merge_by_particle_ids = merge_by_hit_ids.merge(particles,
                                                   how='inner',
                                                   on='particle_id')
    partid_dict = {}
    hitloc_dict = {}
    for row in merge_by_particle_ids.itertuples():
        particleID = row.particle_id
        volID = row.volume_id
        layerID = row.layer_id
        modID = row.module_id
        hitID = row.hit_id
        key_name = event_id + '-' + str(particleID)
        hitloc_dict = {
            'hit_id': hitID,
            'volume_id': volID,
            'layer_id': layerID,
            'module_id': modID
        }
        if key_name in partid_dict:
            partid_dict[key_name].append(hitloc_dict)
        else:
            partid_dict[key_name] = [hitloc_dict]

    with open('mappings/' + event_id + '.json', 'w') as outfile:
        json.dump(partid_dict, outfile)

    end = time.time()
    return partid_dict
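
A minimal usage sketch for batch-processing a few events in parallel; the event-number range is hypothetical and the mappings/ output directory is assumed to exist.

import multiprocessing as mp

if __name__ == '__main__':
    with mp.Pool(processes=4) as pool:
        pool.map(process_single_event, range(1000, 1010))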
Example #5
    def read(self, evtid):
        prefix = os.path.join(os.path.expandvars(self._evt_dir),
                              'event{:09d}'.format(evtid))

        all_data = load_event(prefix,
                              parts=['hits', 'particles', 'truth', 'cells'])
        if all_data is None:
            return False

        hits, particles, truth, cells = all_data
        hits = hits.assign(evtid=evtid)

        if self._blacklist_dir:
            prefix_bl = os.path.join(os.path.expandvars(self._blacklist_dir),
                                     'event{:09d}-blacklist_'.format(evtid))
            hits_exclude = pd.read_csv(prefix_bl + 'hits.csv')
            particles_exclude = pd.read_csv(prefix_bl + 'particles.csv')
            hits = hits[~hits['hit_id'].isin(hits_exclude['hit_id'])]
            particles = particles[~particles['particle_id'].
                                  isin(particles_exclude['particle_id'])]

        ## add pT to particles
        px = particles.px
        py = particles.py
        pt = np.sqrt(px**2 + py**2)
        particles = particles.assign(pt=pt)

        self._evtid = evtid
        self._hits = hits
        self._particles = particles
        self._truth = truth
        self._cells = cells

        self.merge_truth_info_to_hits()
        return True
Example #6
def get_training_sample(path_to_data, event_names):

    events = []
    track_id = 0

    for name in event_names:
        print(name)
        # Read an event
        hits, cells, particles, truth = load_event(
            os.path.join(path_to_data, name))

        # Generate new vector of particle id
        particle_ids = truth.particle_id.values
        particle2track = {}
        for pid in np.unique(particle_ids):
            particle2track[pid] = track_id
            track_id += 1
        hits['particle_id'] = [particle2track[pid] for pid in particle_ids]

        # Collect hits
        events.append(hits)
    # Put all hits into one sample with unique track ids
    data = pd.concat(events, axis=0)

    return data
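
Example call (the data path and event names are placeholders):

path_to_data = '../../data/raw/train_100_events/'
event_names = ['event000001000', 'event000001001', 'event000001002']
train_sample = get_training_sample(path_to_data, event_names)
print(train_sample.shape)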
Example #7
    def init_event(self, event_file, first_init=False):
        if self.verbose > 0:
            print('\n\nLoading event:', event_file)

        self.hits, self.cells, self.particles, self.truth = load_event(
            event_file)

        hits_xyz = self.hits.values[:, 1:4]
        hits_r = np.sqrt(
            np.power(hits_xyz[:, 0], 2) + np.power(hits_xyz[:, 1], 2))

        hits_phi_x = np.sign(hits_xyz[:, 1]) * np.arccos(
            hits_xyz[:, 0] / hits_r)
        hits_phi_y = np.sign(-hits_xyz[:, 0]) * np.arccos(
            hits_xyz[:, 1] / hits_r)
        hits_theta = np.arctan2(hits_xyz[:, 2], hits_r)

        hits_layers = self.hits.values[:, 4:6]
        hits_layers_onehot = np.array([
            self.onehot_layer[tuple(hits_layer)] for hits_layer in hits_layers
        ])

        hits_xyzrphiphitheta = np.concatenate(
            (hits_xyz, np.reshape(hits_r,
                                  (-1, 1)), np.reshape(hits_phi_x, (-1, 1)),
             np.reshape(hits_phi_y, (-1, 1)), np.reshape(hits_theta, (-1, 1))),
            axis=1)

        if self.std_scale:
            if first_init:
                self.coord_rescaler = preprocessing.StandardScaler().fit(
                    hits_xyzrphiphitheta)
            hits_xyzrphiphitheta = self.coord_rescaler.transform(
                hits_xyzrphiphitheta)

        self.hits_input = np.append(hits_xyzrphiphitheta,
                                    hits_layers_onehot,
                                    axis=1)

        hits_module_array = self.hits.values[:, 4:]
        self.hits_module = []
        for module in hits_module_array:
            self.hits_module.append(tuple(module))

        # Collect all ids except for id 0.
        self.track_unique_ids = np.unique(
            np.append([0], self.truth.values[:, 1]))
        self.track_unique_ids = self.track_unique_ids[1:]
        rand.shuffle(self.track_unique_ids)
        self.ntracks = len(self.track_unique_ids)

        self.track_hit_dict = pickle.load(
            open(event_file + "-trackdict.p", "rb"))

        if self.verbose > 0:
            print('Finished loading event:', event_file)
Example #8
    def read_meas(self, event):
        # Read meas files.
        hits, cells = load_event(event, parts=['hits', 'cells'])

        cells = cells.groupby(by=['hit_id'])['value'].agg(['count', 'sum'
                                                           ]).reset_index()
        cells.columns = ['hit_id', 'hit_ncell', 'hit_edep']
        hits = pd.merge(hits, cells, how='left', on='hit_id')

        return hits
Example #9
def main():
    # load 90 train events data
    data_l = []
    for i in range(10, 100):
        event = '../input/train_1/event0000010%d' % i
        print('event:', event)
        hits, cells, particles, truth = load_event(event)
        data = hits
        data = data.merge(truth, how='left', on='hit_id')
        data = data.merge(particles, how='left', on='particle_id')

        # keep hits from tracks originating from the vertex
        data['rv'] = np.sqrt(data.vx**2 + data.vy**2)
        data = data[(data.rv <= 1) & (data.vz <= 50) & (data.vz >= -50)].copy()
        data = data[data.weight > 0]
        data['event_id'] = i

        data['pt'] = np.sqrt(data.px**2 + data.py**2)

        # use a simple relationship to compute alpha0 from pt, see documentation or EDA notebook.
        data['alpha0'] = np.exp(-8.115 - np.log(data.pt))

        data_l.append(data)

    data = pd.concat(data_l, axis=0)

    # compute track level statistics
    df = data.groupby(['event_id', 'particle_id'])[['alpha0', 'vz']].first()
    df = df.dropna()
    np.save('../data/scan_center.npy', df.values)

    # compute tracklet frequencies
    # tracklets are sub tracks of length 4

    # assign a unique layer to each hit
    data['layer'] = 100 * data.volume_id + data.layer_id

    # for each track compute a string containing the sequence of layers traversed by the track
    data = data.sort_values(by=['particle_id', 'z']).reset_index(drop=True)
    df = data.groupby(['event_id', 'particle_id'
                       ]).layer.apply(lambda s: ' '.join([str(i) for i in s]))
    df = df.to_frame('layers')

    # count the occurrences of each tracklet
    cnt = Counter()
    for x in tqdm(df.itertuples(name=None, index=False)):
        layers = x[0].split()
        for i in range(len(layers) - 3):
            s = ' '.join(layers[i:i + 4])
            cnt[s] += 1

    # save result
    with open('../data/layers_4_center_fix.pkl', 'wb') as file:
        pkl.dump(cnt, file)
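
For reference, the alpha0 relation used above reduces to a simple inverse proportionality in pt:

import numpy as np

# exp(-8.115 - log(pt)) == exp(-8.115) / pt ~= 2.99e-4 / pt
pt = np.array([0.5, 1.0, 2.0])
alpha0 = np.exp(-8.115 - np.log(pt))
assert np.allclose(alpha0, np.exp(-8.115) / pt)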
Example #10
def get_fake_tracks(event, n_sample=5000):
    hits, = load_event(event, parts=['hits'])
    fake_tracks = []
    for idx in range(n_sample):
        rand_hits = random.randint(0, 20)
        data = hits.sample(rand_hits)[['x', 'y', 'z']].values
        data = np.pad(data, ((0, 20), (0, 0)),
                      'constant',
                      constant_values=(4, 99999))[:20, :]
        fake_tracks.append(data)
    return np.array(fake_tracks)
Example #11
def visualize_prediction_v_truth():
    fig, ax = plt.subplots(figsize=(15, 15))
    ax = Axes3D(fig)
    event = 1000
    event_prefix = "event00000%d"
    weights = [0.1, 0.01, 0.1, 1, 1]
    event_id = event_prefix % event
    hits, cells, particles, truth = load_event(
        os.path.join(path_to_train, event_id))
    one_submission, score = add_submission(event, hits, weights, True, truth)
    plot_tracks(ax, one_submission.track_id.unique(), hits)
    show_3Dplot(ax)
Example #12
    def read_event(self, path, eta_cut=3.2):
        hits, cells, particles, truth = load_event(path)

        hits_features = get_features(hits)
        # apply the eta cuts on hits
        hits_features = hits_features[(hits_features['eta'] > eta_cut) |
                                      (hits_features['eta'] < -1 * eta_cut)]

        hits_with_truth = hits_features.merge(filter_truth(truth), on='hit_id')
        particles = pd.Series(np.unique(hits_with_truth['particle_id']))

        return hits_with_truth, particles
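
get_features is not defined in this snippet; a minimal sketch, assuming it only needs to add the pseudorapidity column ('eta') used by the cut above.

import numpy as np

def get_features(hits):
    # pseudorapidity eta = -ln(tan(theta / 2)), theta being the polar angle from the z axis
    r = np.sqrt(hits.x**2 + hits.y**2)
    theta = np.arctan2(r, hits.z)
    return hits.assign(eta=-np.log(np.tan(theta / 2.0)))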
Example #13
    def open(self, use_true_coords=False):
        """Load event data from disk."""
        parts = ['hits']
        if self._with_cells:
            parts.append('cells')
        if self._with_truth:
            parts.extend(['particles', 'truth'])
        data = load_event(self._prefix, parts=parts)
        hits = data[0]
        if self._with_cells:
            self.cells_df = data[1]
        else:
            self.cells_df = None
        self.hits_df = hits
        self._validate()
        min_hit_id = hits['hit_id'].min()
        max_hit_id = hits['hit_id'].max()
        self.max_hit_id = max_hit_id
        # assume that hit_ids are exactly the integers 1,...,#hits
        # this way we can speed up the coordinate lookup quite a bit
        assert min_hit_id == 1 and max_hit_id == len(hits)
        if self._with_truth:
            particles, truth = data[-2:]
            self._particles_columns_orig = particles.columns
            self._truth_columns_orig = truth.columns
            self.particles_df = particles
            self.truth_df = truth
        else:
            assert not use_true_coords
            self.particles_df = None
            self.truth_df = None
        self.hits_coords_by_id = []
        for i, col in enumerate(('x', 'y', 'z')):
            # IMPORTANT: We convert coordinates to float64. Otherwise we run
            # into numerical precision problems in our helix arithmetic.
            # Note: index 0 must map to np.nan.
            coord = np.full(1 + max_hit_id, np.nan, dtype=np.float64)
            if use_true_coords:
                true_col = 't' + col
                coord[self.truth_df['hit_id'].values] = \
                    self.truth_df[true_col].values
            else:
                coord[hits['hit_id'].values] = hits[col].values
            self.hits_coords_by_id.append(coord)
        self._hits_module_id_by_id = np.full(1 + max_hit_id, -1,
                                             dtype=np.int16)
        assert np.all(
            hits['module_id'] <= np.iinfo(self._hits_module_id_by_id.dtype).max)
        self._hits_module_id_by_id[hits['hit_id'].values] = \
            hits['module_id'].values
        self._isopen = True
Example #14
def main():
    print('\n')
    print('-' * 50)
    print('Starting job...')
    print('-' * 50)
    t0 = time.time()

    event_id = '000001000'
    event_path = os.path.join(path_to_train, 'event' + event_id)
    hits, cells, particles, truth = load_event(event_path)

    truth = truth.merge(hits, on=['hit_id'], how='left')

    df = truth.copy()
    df = df.assign(r=np.sqrt(df.x**2 + df.y**2))
    df = df.assign(d=np.sqrt(df.x**2 + df.y**2 + df.z**2))
    df = df.assign(a=np.arctan2(df.y, df.x))
    df = df.assign(cosa=np.cos(df.a))
    df = df.assign(sina=np.sin(df.a))
    df = df.assign(phi=np.arctan2(df.z, df.r))

    fig, ax = plt.subplots(figsize=(12, 12))
    ax = Axes3D(fig)
    #ax.scatter(xs=seed.a,
    #           ys=seed.r,
    #           zs=seed.z,
    #           c='gray',
    #           s=1)

    d_delta = 200
    overlap = 0.3
    its = math.ceil((1050 / d_delta))
    for i in range(its):
        start = (i * d_delta) - (d_delta * overlap)
        if start < 0:
            start = 0
        end = (i * d_delta) + d_delta
        print(start, end)
        seed_tracks(event_id, df, start, end, ax)

    ax.set_xlabel('a')
    ax.set_ylabel('r')
    ax.set_zlabel('z  (mm)')
    plt.show()

    print('-' * 50)
    print('Success!')
    t1 = time.time()
    print('Total time', (t1 - t0) / 60)
    print('-' * 50)
    print('\n' * 2)
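
A quick check of the (start, end) windows generated by the loop above; with d_delta = 200 and overlap = 0.3 each slice overlaps its predecessor by 60.

import math

d_delta, overlap = 200, 0.3
windows = [(max(0, i * d_delta - d_delta * overlap), i * d_delta + d_delta)
           for i in range(math.ceil(1050 / d_delta))]
print(windows)
# [(0, 200), (140.0, 400), (340.0, 600), (540.0, 800), (740.0, 1000), (940.0, 1200)]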
Example #15
def DBScan():
    data_dir = 'path'

    idd = ['***']
    sum = 0
    sum_score = 0
    for i, eve_id in enumerate(idd):
        hits, cells, particles, truth = load_event(data_dir + '/event' + eve_id)
        labels = do_dbscan_predict(hits)
        submission = create_one_event_submission(0, hits['hit_id'].values, labels)
        score = score_event(truth, submission)
        print('[%2d] score : %0.8f' % (i, score))
        sum_score += score
        sum += 1
Example #16
    def read_mc(self, event):
        # Read MC files.
        mchits, mctracks = load_event(event, parts=['truth', 'particles'])
        print(len(mctracks))

        # Fill dummy data for non-particle hits.
        mctracks = pd.concat([
            mctracks,
            pd.DataFrame([[0, 0, 0, 0, 0, 0, 0, 0, 0]],
                         columns=mctracks.columns)
        ])

        mchits = pd.merge(mchits, mctracks, how='left', on='particle_id')
        return mchits
Example #17
def analyse_event(event_number, volume_id, size_of_hit):
    hits, cells, particles, truth = load_event(
        '/Users/pjfox/Dropbox/NN/TrackML/train_100_events/event00000' +
        str(event_number))
    selectedhits, directions = goodhit_and_directions(hits, truth, volume_id)
    hitinfo = np.empty([len(selectedhits), size_of_hit])
    for ii, hit_id in enumerate(selectedhits[:, 0]):
        hitinfo[ii] = makelistofhits(cells, hits, hit_id, size_of_hit)
    np.save(
        "hits_info_Vol_" + str(volume_id) + "_event_" + str(event_number) +
        ".npy", hitinfo)
    np.save(
        "direction_info_Vol_" + str(volume_id) + "_event_" +
        str(event_number) + ".npy", directions)
    return None
Example #18
def training_data_with_eta_cut(train_dir='input/train_1',
                               event_prefix="event000001000",
                               eta_cut=3.2):
    hits, cells, particles, truth = load_event(
        os.path.join(train_dir, event_prefix))

    hits_features = get_features(hits)
    # high_eta_hits = hits_features[(hits_features['eta'] > eta_cut) | (hits_features['eta'] < -1 * eta_cut)]
    high_eta_hits = hits_features[(hits_features['eta'] > eta_cut)]
    uID_for_higheta = make_uID(high_eta_hits)
    high_eta_hits_uID = pd.merge(high_eta_hits,
                                 uID_for_higheta,
                                 on=['volume_id', 'layer_id', 'module_id'])
    train_data_higheta = high_eta_hits_uID.merge(
        filter_truth(truth), on='hit_id')[['uID', 'particle_id']]
    return train_data_higheta, uID_for_higheta
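
make_uID is not shown here; a plausible sketch, assuming it simply enumerates the distinct (volume_id, layer_id, module_id) triplets so that every detector module gets a unique integer ID.

def make_uID(df):
    uid = (df[['volume_id', 'layer_id', 'module_id']]
           .drop_duplicates()
           .reset_index(drop=True))
    uid['uID'] = uid.index
    return uid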
Example #19
def get_real_tracks(event):
    hits, cells, particles, truth = load_event(event)
    hits_truth = pd.merge(hits, truth, on=['hit_id'])
    pIDs = np.unique(hits_truth['particle_id'])
    track_list = []
    for pID in pIDs:
        if pID == 0:
            continue
        this_track = hits_truth[hits_truth['particle_id'] == pID][[
            'x', 'y', 'z'
        ]].values
        this_track = np.pad(this_track, ((0, 20), (0, 0)),
                            'constant',
                            constant_values=(4, 99999))[:20, :]
        track_list.append(this_track)
    return np.array(track_list)
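
A minimal sketch of how the real and fake track samples from the two functions above might be combined into a labelled array for a classifier (the event path is a placeholder).

import numpy as np

event = '../input/train_1/event000001000'
real_tracks = get_real_tracks(event)                  # shape (n_real, 20, 3)
fake_tracks = get_fake_tracks(event, n_sample=5000)   # shape (5000, 20, 3)
X = np.concatenate([real_tracks, fake_tracks], axis=0)
y = np.concatenate([np.ones(len(real_tracks)), np.zeros(len(fake_tracks))])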
Example #20
def discover_highest_amount_of_hits():
    maxV = 0

    with open('/tmp/Events_Train_1') as f:
        for line in f:
            event_prefix = line.strip()
            print(event_prefix)
            hits, cells, particles, truth = load_event(
                os.path.join('/data/trackMLDB/train_1', event_prefix))

            for index, row in particles.iterrows():
                truth_0 = truth[truth.particle_id == row['particle_id']]
                AUX = truth_0['hit_id'].count()
                if (AUX > maxV):
                    maxV = AUX
                    print(maxV)
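
The per-particle loop above can be replaced by a vectorised equivalent for a single event; a sketch (particle_id 0 marks noise hits and is excluded, since it never appears in the particles table).

def max_hits_per_particle(truth):
    # highest number of hits left by any single particle in this event
    return truth.loc[truth.particle_id != 0, 'particle_id'].value_counts().max()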
Example #21
def process_particles(event):
    hits, cells, particles, truth = load_event(event)

    # select interesting columns and calculate distance from origin 't'
    hits = hits[['hit_id', 'x', 'y', 'z']]
    hits = hits.assign(t=lambda hit: np.sqrt(hit.x**2 + hit.y**2 + hit.z**2))

    # merge datasets to associate particle_id, hit_id, layer_id, and module_id,
    # and sort by distance from the origin
    data = pd.merge(truth[['particle_id', 'hit_id']],
                    hits,
                    on='hit_id',
                    how='inner').sort_values(by=['particle_id', 't'])

    # Add columns containing the coordinates of the previous and next hits
    data = data.assign(x0=data['x'].shift(1),
                       x1=data['x'],
                       x2=data['x'].shift(-1),
                       y0=data['y'].shift(1),
                       y1=data['y'],
                       y2=data['y'].shift(-1))

    # a mask that selects all but the first and last rows
    def mask(x):
        result = np.ones_like(x)
        result[0] = 0
        result[x.shape[0] - 1] = 0
        return result

    # Create a mask that selects all but the first and last element from each group
    mask = data.groupby(['particle_id'
                         ])['particle_id'].transform(mask).astype(bool)

    # Applies mask to remove first and last hits from each particle and
    # calculates helix parameters for remaining hits
    data = data.loc[mask].assign(hx=lambda n: fit_circle(
        (n.x0, n.y0), (n.x1, n.y1), (n.x2, n.y2))[0],
                                 hy=lambda n: fit_circle((n.x0, n.y0),
                                                         (n.x1, n.y1),
                                                         (n.x2, n.y2))[1],
                                 hr=lambda n: fit_circle((n.x0, n.y0),
                                                         (n.x1, n.y1),
                                                         (n.x2, n.y2))[2])

    # Iterate through each group and yield it
    for name, group in data.groupby('particle_id'):
        yield (name, group[['x', 'y', 'z', 'hx', 'hy', 'hr']])
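
fit_circle is not shown above; a minimal sketch, assuming it returns the centre (hx, hy) and radius hr of the circle through three points and works elementwise on the pandas Series passed in the lambdas.

import numpy as np

def fit_circle(p0, p1, p2):
    (x0, y0), (x1, y1), (x2, y2) = p0, p1, p2
    d = 2.0 * (x0 * (y1 - y2) + x1 * (y2 - y0) + x2 * (y0 - y1))
    s0, s1, s2 = x0**2 + y0**2, x1**2 + y1**2, x2**2 + y2**2
    hx = (s0 * (y1 - y2) + s1 * (y2 - y0) + s2 * (y0 - y1)) / d
    hy = (s0 * (x2 - x1) + s1 * (x0 - x2) + s2 * (x1 - x0)) / d
    hr = np.sqrt((x0 - hx)**2 + (y0 - hy)**2)
    return hx, hy, hr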
Example #22
def process_event(prefix, pt_min, n_phi_sectors, select_phi_sector,
                  phi_slope_max, phi_slope_mid_max, phi_slope_outer_max,
                  z0_max, no_missing_hits, n_tracks):
    # Load the data
    evtid = int(prefix[-9:])
    logging.info('Event %i, loading data' % evtid)
    hits, particles, truth = dataset.load_event(
        prefix, parts=['hits', 'particles', 'truth'])

    # Apply hit selection
    logging.info('Event %i, selecting hits' % evtid)
    hits = select_hits(hits,
                       truth,
                       particles,
                       pt_min=pt_min,
                       no_missing_hits=no_missing_hits).assign(evtid=evtid)
    hits_sectors = split_phi_sectors(hits,
                                     n_phi_sectors=n_phi_sectors,
                                     select_phi_sector=select_phi_sector)

    # Graph features and scale
    feature_names = ['r', 'phi', 'z']
    feature_scale = np.array([1000., np.pi / n_phi_sectors, 1000.])

    # Define adjacent layers
    n_det_layers = 10
    l = np.arange(n_det_layers)
    layer_pairs = np.stack([l[:-1], l[1:]], axis=1)

    # Construct the graph
    logging.info('Event %i, constructing graphs' % evtid)
    graphs = [
        construct_graph(sector_hits,
                        layer_pairs=layer_pairs,
                        phi_slope_max=phi_slope_max,
                        phi_slope_mid_max=phi_slope_mid_max,
                        phi_slope_outer_max=phi_slope_outer_max,
                        z0_max=z0_max,
                        feature_names=feature_names,
                        feature_scale=feature_scale,
                        max_tracks=n_tracks,
                        no_missing_hits=no_missing_hits)
        for sector_hits in hits_sectors
    ]

    return graphs
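
For reference, with n_det_layers = 10 the layer_pairs array built above is just the nine consecutive index pairs.

import numpy as np

l = np.arange(10)
print(np.stack([l[:-1], l[1:]], axis=1).tolist())
# [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7], [7, 8], [8, 9]]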
Example #23
def train(model, train, shift=10000, polar=0):
    """Takes in the training data, cleans it, and the uses it to train the model
    params
    model -- created, but untrained model
    train -- training data set
    shift -- coordinate shift needed to eliminate negative numbers
    polar -- used to turn polar coordinate on and off
    """
    for e in train:
        # separate out the hits, cells, and truth from the data
        hits, cells, truth = load_event(e, parts=['hits', 'cells', 'truth'])
        hits['event_id'] = int(e[-9:])
        # group the cells by hit ID and take the mean of ch0, ch1, and value
        cells = cells.groupby(by=['hit_id'])[['ch0', 'ch1', 'value']].agg(
            ['mean']).reset_index()
        cells.columns = ['hit_id', 'ch0', 'ch1', 'value']
        hits = pd.merge(hits, cells, how='left', on='hit_id')
        hits = pd.merge(hits, truth, how='left', on='hit_id')
        print(e, "Train")
        # apply the shift to eliminate negative numbers
        hits['x'] += shift
        hits['y'] += shift
        hits['z'] += shift
        # If polar coordinates are turned on, calculate the radius and theta
        # and use them to train the model
        if polar == 1:
            hits['radius'] = np.sqrt(hits['y'] * hits['y'] +
                                     hits['x'] * hits['x'])
            # polar angle of each hit in the transverse plane
            hits['theta'] = np.arctan2(hits['y'], hits['x'])
            model.fit(hits[[
                'hit_id', 'radius', 'theta', 'z', 'volume_id', 'layer_id',
                'module_id', 'event_id', 'ch0', 'ch1', 'value'
            ]].values,
                      hits['particle_id'],
                      batch_size=10000,
                      epochs=1)
        else:
            # Train the model on the non-polar x, y, and z values
            model.fit(hits[[
                'hit_id', 'x', 'y', 'z', 'volume_id', 'layer_id', 'module_id',
                'event_id', 'ch0', 'ch1', 'value'
            ]].values,
                      hits['particle_id'],
                      batch_size=10000,
                      epochs=1)
    return model
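
A minimal usage sketch for the train/test pair above. build_model is a hypothetical factory that returns an estimator with the Keras-style fit/predict interface assumed by both functions, and the event prefixes are placeholders.

import pandas as pd

net = build_model()  # hypothetical model factory
train_events = ['../input/train_1/event000001000',
                '../input/train_1/event000001001']
test_events = ['../input/test/event000000001']
net = train(net, train_events, shift=10000, polar=1)
df_test = test(net, test_events, shift=10000)
submission = pd.concat(df_test, axis=0)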
def work_sub(param):
    # merges tracks for event i and saves result into a file
    (i, ) = param
    th_b = 0.16
    th_c = 0.45

    event = get_event(i)
    print('event:', event)
    hits, cells = load_event('../input/test/' + event, parts=['hits', 'cells'])
    data = pd.read_csv('../submissions/final/' + event)
    data = data.merge(hits, how='left', on='hit_id')
    inner = pd.read_csv('../submissions/final_inner/' + event)
    data = merge_track(data, inner, th_b, th_c, 1)
    data['event_id'] = i
    data[['event_id', 'hit_id',
          'track_id']].to_csv('../submissions/merge_final/' + event + '.csv',
                              index=False)
    return i
Example #25
def test_scoring():
    event_prefix = "event00000%d"
    jobs = []
    weights = [0.1, 0.01, 0.1, 1, 1]
    pool = mp.Pool(processes=1)
    for event in [1000 + i for i in range(1)]:
        event_id = event_prefix % event
        hits, cells, particles, truth = load_event(
            os.path.join(path_to_train, event_id))
        jobs.append(
            pool.apply_async(add_submission,
                             args=(event, hits, weights, True, truth)))

    test_dataset_submissions = [job.get()[0] for job in jobs]
    scores = [job.get()[1] for job in jobs]

    print('Avg of events: ')
    print(sum(scores) / float(len(scores)))
def read(data_dir, evtid, info=False):
    prefix = os.path.join(os.path.expandvars(data_dir), 'event{:09d}'.format(evtid))

    all_data = load_event(prefix, parts=['hits', 'particles', 'truth', 'cells'])
    if all_data is None:
        return None
    hits, particles, truth, cells = all_data
    hits = hits.assign(evtid=evtid)

    px = particles.px
    py = particles.py
    pt = np.sqrt(px**2 + py**2)
    particles = particles.assign(pt=pt)

    if info:
        print("# of hits: ", hits.shape[0])
        print("# of particles: ", particles.shape[0])

    return hits, particles, truth, cells
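
Example call (the data directory is a placeholder):

hits, particles, truth, cells = read('../input/train_1', 1000, info=True)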
def run_dbscan():
    data_dir = '../input/train_1'

    event_ids = ['000001000']
    sum = 0
    sum_score = 0
    for i, event_id in enumerate(event_ids):
        hits, cells, particles, truth = load_event(data_dir + '/event' +
                                                   event_id)
        labels = do_dbscan_predict(hits)
        submission = create_one_event_submission(0, hits['hit_id'].values,
                                                 labels)
        score = score_event(truth, submission)
        print('[%2d] score : %0.8f' % (i, score))
        sum_score += score
        sum += 1

    print('--------------------------------------')
    print(sum_score / sum)
Example #28
    def __init__(
        self,
        event_file,
        detector_file,
        detector=None,
        use_null_module=True,
        import_track_hits=True
    ):  # Null module is an empty module to indicate no detector hit.
        self.hits, self.cells, self.particles, self.truth = load_event(
            event_file)
        if detector is None:
            self.detector = Detector(detector_file)
        else:
            self.detector = detector
        self.use_null_module = use_null_module
        self.track_hit_dict = pickle.load(
            open(event_file + "-trackdict.p", "rb"))

        self.init_detector_geometry()
Example #29
def load_training():
    if os.path.exists(data_name):
        with open(data_name, 'rb') as fp:
            training_df = pickle.load(fp)
    else:
        det = pd.read_csv('input/detectors.csv')
        hits_pd = make_uID(det)

        training_df = []
        train = np.unique([p.split('-')[0] for p in sorted(glob.glob('input/train_1/**'))])
        for ds_name in train:
            hits, truth = load_event(ds_name, parts=['hits', 'truth'])
            hits_with_uID = pd.merge(hits, hits_pd, on=['volume_id', 'layer_id', 'module_id'])
            filtered_truth = truth[(truth['weight'] > 5e-7) & (truth['particle_id'] != 0)]
            training = hits_with_uID.merge(filtered_truth, on='hit_id')[['uID', 'particle_id']]
            unique_truth = pd.Series(np.unique(training['particle_id']))
            training_df.append((training, unique_truth))

        with open(data_name, 'wb') as fp:
            pickle.dump(training_df, fp)
    return training_df
Example #30
    def get_score_of_one_event(self, ievt):
        event_str = "event00000{0}".format(1000+ievt)
        logging.debug("processing {} event:".format(event_str))

        # get score of the track candidates
        # load the event info from tracking ml
        event_input_name = os.path.join(self.trackdata_dir, event_str)
        hits, truth = load_event(event_input_name, parts=['hits', 'truth'])

        all_gnn_tracks = []
        all_true_tracks = []
        for i in range(self.n_sections):
            res_tracks = self.get_tracks_of_one_sector(event_str, i, hits, truth)
            all_gnn_tracks += res_tracks[0]
            all_true_tracks += res_tracks[1]

        event_gnn   = score_tracks(all_gnn_tracks,  hits, truth)
        event_truth = score_tracks(all_true_tracks, hits, truth)
        logging.debug("SCORE of {} event: {:.4f} {:.4f} {:.4f}, {:.4f}\n".format(
                      ievt, event_gnn[0], event_truth[0], event_gnn[0]/event_truth[0], event_truth[1])
        )
        return [event_gnn[0], event_truth[0]]