Example #1
def retain_relevant_fields(data):
    #TODO read these from a conf file
    aggregate_fields = {}
    aggregate_fields['aggr0_6'] = (
        data['ika0'] + data['ika1'] + data['ika2'] + data['ika3'] +
        data['ika4'] + data['ika5'] + data['ika6'])
    aggregate_fields['aggr7_12'] = (
        data['ika7'] + data['ika8'] + data['ika9'] + data['ika10'] +
        data['ika11'] + data['ika12'])
    aggregate_fields['aggr13_17'] = (
        data['ika13'] + data['ika14'] + data['ika15'] + data['ika16'] +
        data['ika17'])
    aggregate_fields['aggr18_29'] = (
        data['ika18'] + data['ika19'] + data['ika20'] + data['ika21'] +
        data['ika22'] + data['ika23'] + data['ika24'] + data['ika25_29'])
    aggregate_fields['aggr30_64'] = (
        data['ika30_34'] + data['ika35_39'] + data['ika40_44'] +
        data['ika45_49'] + data['ika50_54'] + data['ika55_59'] +
        data['ika60_64'])
    aggregate_fields['aggr64_'] = (
        data['ika65_69'] + data['ika70_74'] + data['ika75_79'] +
        data['ika80_84'] + data['ika85_89'] + data['ika90_94'] +
        data['ika95_'])

    # add the aggregate fields to the data record array
    augmented_data = rf.rec_append_fields(data, list(aggregate_fields.keys()),
                                          list(aggregate_fields.values()))
    # these fields are only added to the dict afterwards: they are kept for their
    # column names below, but appending them above would raise because they
    # already exist in data
    aggregate_fields['asyht'] = data['asyht']
    aggregate_fields['ruots'] = data['ruots']
    aggregate_fields['ekoord'] = data['ekoord']
    aggregate_fields['nkoord'] = data['nkoord']

    # drop all fields whose names are not in aggregate_fields
    fields2drop = [
        d for d in data.dtype.names if d not in aggregate_fields.keys()
    ]
    return rf.rec_drop_fields(augmented_data, fields2drop)
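
The append-then-prune pattern above generalizes beyond this dataset. Below is a minimal, self-contained sketch of the same idea on a toy structured array; the field names (ika0..ika2, asyht, aggr0_2) are illustrative only, and `rf` is assumed to be numpy.lib.recfunctions as in the example.

import numpy as np
import numpy.lib.recfunctions as rf

data = np.array([(1, 2, 3, 9.0), (4, 5, 6, 7.0)],
                dtype=[('ika0', 'i4'), ('ika1', 'i4'), ('ika2', 'i4'), ('asyht', 'f8')])

# build one aggregate column, append it, then drop everything not kept
aggregates = {'aggr0_2': data['ika0'] + data['ika1'] + data['ika2']}
augmented = rf.rec_append_fields(data, list(aggregates.keys()), list(aggregates.values()))
keep = set(aggregates) | {'asyht'}
drop = [name for name in data.dtype.names if name not in keep]
print(rf.rec_drop_fields(augmented, drop).dtype.names)  # ('asyht', 'aggr0_2')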
Example #2
def move_bad_fields_to_bottom(oldArray, orderedFieldList, orderedTypeList):
    """
    Move the given fields in a structured array to the bottom and change their type
    
    Input
    -----                                                           
    oldArray : numpy structured array
        previous array to modify               
    orderFieldList : list
        list of fields to move and change type 
    orderedTypeList : list
        list of new types for the fields
                           
    Returns an array with some fields moved to the bottom and with a different type
    """

    outArray = oldArray.copy()
    for name, typ in zip(orderedFieldList, orderedTypeList):
        #Remove field of interest from the array
        tmpArray = rec.rec_drop_fields(outArray, name)

        #Append the same field at the end of the array with the right data type
        outArray = rec.rec_append_fields(tmpArray,
                                         name,
                                         oldArray[name].copy(),
                                         dtypes=typ)
    return outArray
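
A hedged usage sketch for move_bad_fields_to_bottom on a toy array; the array and field names are illustrative, and `rec` is assumed to be numpy.lib.recfunctions as in the function body.

import numpy as np
import numpy.lib.recfunctions as rec

arr = np.array([(1, 2.0, 3), (4, 5.0, 6)],
               dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'i4')])

# move field 'a' to the end and store it as float64
out = move_bad_fields_to_bottom(arr, ['a'], ['f8'])
print(out.dtype.names)  # ('b', 'c', 'a')
print(out['a'].dtype)   # float64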
Example #3
def add_time_column(table, name='time', pop_start=True, pop_offset=True):
    """Append a column named 'time' by combining the gps_start and _offset

    Parameters
    ----------
    table : `EventTable`
        table of events to modify
    name : `str`, optional
        name of field to append, default: 'time'
    pop_start: `bool`, optional
        remove the 'gps_start' field when finished, default: `True`
    pop_offset: `bool`, optional
        remove the 'gps_offset' field when finished, default: `True`

    Returns
    -------
    mod : `recarray`, matches type of input
        a modified version of the input table with the new time field
    """
    type_ = type(table)
    t = table['gps_start'] + table['gps_offset']
    drop = []
    if pop_start:
        drop.append('gps_start')
    if pop_offset:
        drop.append('gps_offset')
    if drop:
        table = recfunctions.rec_drop_fields(table, drop)
    return recfunctions.rec_append_fields(table, [name], [t]).view(type_)
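
A small usage sketch, assuming a plain numpy recarray stands in for the EventTable and that `recfunctions` is numpy.lib.recfunctions imported at module level; the toy values and the extra 'snr' column are illustrative.

import numpy as np

events = np.rec.fromarrays(
    [np.array([100.0, 200.0]), np.array([0.25, 0.75]), np.array([5.0, 8.0])],
    names=['gps_start', 'gps_offset', 'snr'])

out = add_time_column(events)
print(out.dtype.names)  # ('snr', 'time')
print(out['time'])      # [100.25 200.75]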
Example #4
    def delete_field(self, name):
        """Delete the field with the given name."""
        if name not in self.dtype.names:
            raise ValueError(
                '{!r} is not among the fields {}.'.format(
                    name, self.dtype.names))
        new_array = rec_drop_fields(self, name)
        new = BoundStructArray(new_array,
                               self.index_key,
                               self._is_attr_of,
                               keys_multicol=self._keys_multicol)
        setattr(self._is_attr_of[0], self._is_attr_of[1], new)
Example #5
def _drop_object_col(rec, warn=True):
    # ignore columns of type `object` since PyTables does not support these
    if rec.dtype.hasobject:
        object_fields = []
        fields = rec.dtype.fields
        for name in rec.dtype.names:
            if fields[name][0].kind == "O":
                object_fields.append(name)
                if warn:
                    log.warning("ignoring unsupported object branch '{0}'".format(name))
        # NumPy 1.7.1: TypeError: Cannot change data-type for object array.
        # return rec[non_object_fields]
        if object_fields:
            rec = recfunctions.rec_drop_fields(rec, object_fields)
    return rec
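
A hedged sketch of calling _drop_object_col on a toy structured array containing one object-dtype column; warn=False avoids the module-level `log`, and the field names are illustrative.

import numpy as np
from numpy.lib import recfunctions

arr = np.array([(1, 2.5, {'a': 1}), (2, 3.5, {'b': 2})],
               dtype=[('run', 'i4'), ('pt', 'f8'), ('meta', 'O')])

clean = _drop_object_col(arr, warn=False)
print(clean.dtype.names)  # ('run', 'pt')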
Example #6
    def _from_hdf_dataset(cls, dataset) -> "DropletTrack":
        """construct a droplet track by reading data from an hdf5 dataset

        Args:
            dataset:
                an HDF5 dataset from which the data of the droplet track is read
        """
        # there are values, so the emulsion is not empty
        droplet_class = dataset.attrs["droplet_class"]
        obj = cls()
        if droplet_class == "None":
            return obj
        else:
            # separate time from the data set
            times = dataset["time"]
            droplet_data = rfn.rec_drop_fields(dataset, "time")
            for time, data in zip(times, droplet_data):
                droplet = droplet_from_data(droplet_class, data)
                obj.append(droplet, time=time)  # type: ignore

        return obj
Example #7
    print(target)

    '''
    for i in range(len(data)):
        if data[target][i] > (data[target].mean() + 2*data[target].std()) or data[target][i] < (data[target].mean() - 2*data[target].std()):
            delList = np.append(delList, i)
            print (data[target].mean() - 1*data[target].std()), data[target].std()
    '''

    #clf = linear.BayesianRidge(verbose=True, alpha_1=2, alpha_2=2, lambda_1=.01, lambda_2=.01, fit_intercept=True, compute_score=True)
    #clf = linear.BayesianRidge(verbose=True)
    #clf = tree.DecisionTreeRegressor(max_depth=2)
    clf = svm.SVR(C=10000.0, kernel='rbf', degree=1)
    data = np.delete(data, delList, 0)
    data, testa, features, fillVal = util.prepDataTrain(data, target, featuresList, False, 20, False, True, 'mean', False, 'set')
    data = recfunctions.rec_drop_fields(data, delFeatures)
    #features = ['CTI','Depth', 'RELI', 'LSTN']
    #an.plotData(np.sqrt(1+data['P']), data['ELEV']*(-1*data['TMAP']))
    #data, clust, enc, newCol = clusterData(data, clusterFields, True)
    #testa, clust, enc, newCol = clusterData(testa, pickTest, True, enc, clust, False)
    #features = np.concatenate((features, newCol))
    
    #Use/tune your predictor
    #clf.fit(data[features].tolist(), data[target])
    #import pydot
    #dot_data = StringIO.StringIO() 
    #tree.export_graphviz(clf, out_file=dot_data) 
    #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
    
    #graph.write_pdf("./ds.pdf")
Example #8
File: mc.py Project: sagittaeri/htt
    def records(self,
                category=None,
                region=None,
                fields=None,
                cuts=None,
                include_weight=True,
                systematic='NOMINAL',
                scale=1.,
                return_idx=False,
                **kwargs):

        from .ztautau import Ztautau

        if include_weight and fields is not None:
            if 'weight' not in fields:
                fields = list(fields) + ['weight']

        selection = self.cuts(category, region, systematic) & cuts
        table_selection = selection.where()

        if systematic == 'NOMINAL':
            log.info("requesting table from %s" %
                     (self.__class__.__name__))
        else:
            log.info("requesting table from %s for systematic %s " %
                     (self.__class__.__name__, systematic_name(systematic)))
        log.debug("using selection: %s" % selection)

        # TODO: handle cuts in weight expressions
        weight_branches = self.get_weight_branches(systematic, no_cuts=True)
        if systematic in SYSTEMATICS_BY_WEIGHT:
            systematic = 'NOMINAL'

        recs = []
        if return_idx:
            idxs = []
        for ds, _, sys_tables, sys_events, xs, kfact, effic in self.datasets:

            try:
                table = sys_tables[systematic]
                events = sys_events[systematic]
            except KeyError:
                log.debug(
                    "table for %s not present for %s "
                    "using NOMINAL" % (systematic, ds.name))
                table = sys_tables['NOMINAL']
                events = sys_events['NOMINAL']

            actual_scale = self.scale
            if isinstance(self, Ztautau):
                if systematic == ('ZFIT_UP',):
                    log.debug("scaling up for ZFIT_UP")
                    actual_scale += self.scale_error
                elif systematic == ('ZFIT_DOWN',):
                    log.debug("scaling down for ZFIT_DOWN")
                    actual_scale -= self.scale_error

            weight = (
                scale * actual_scale *
                LUMI[self.year] *
                xs * kfact * effic / events)

            # read the table with a selection
            try:
                if table_selection:
                    rec = table.read_where(table_selection, **kwargs)
                else:
                    rec = table.read(**kwargs)
            except Exception as e:
                print(table)
                print(e)
                continue
                #raise

            if return_idx:
                # only valid if table_selection is non-empty
                idx = table.get_where_list(table_selection, **kwargs)
                idxs.append(idx)

            # add weight field
            if include_weight:
                weights = np.empty(rec.shape[0], dtype='f8')
                weights.fill(weight)
                # merge the weight fields
                weights *= reduce(np.multiply,
                    [rec[br] for br in weight_branches])
                # drop other weight fields
                rec = recfunctions.rec_drop_fields(rec, weight_branches)
                # add the combined weight
                rec = recfunctions.rec_append_fields(rec,
                    names='weight',
                    data=weights,
                    dtypes='f8')
                if rec['weight'].shape[0] > 1 and rec['weight'].sum() == 0:
                    log.warning("{0}: weights sum to zero!".format(table.name))

            if fields is not None:
                try:
                    rec = rec[fields]
                except Exception as e:
                    print(table)
                    print(rec.shape)
                    print(rec.dtype)
                    print(e)
                    raise
            recs.append(rec)

        if return_idx:
            return zip(recs, idxs)
        return recs
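
The weight handling above boils down to a reusable pattern: multiply the per-event weight columns into a single array, drop those columns, and append one combined 'weight' field. A minimal sketch of that pattern on a toy array; the branch names and normalisation factor are illustrative, not taken from the original project.

import numpy as np
from functools import reduce
from numpy.lib import recfunctions

rec = np.array([(100.0, 0.9, 1.1), (125.0, 1.0, 0.8)],
               dtype=[('mmc_mass', 'f8'), ('pileup_weight', 'f8'), ('trigger_weight', 'f8')])
weight_branches = ['pileup_weight', 'trigger_weight']

weights = np.full(rec.shape[0], 1.5)  # overall normalisation (e.g. cross-section * lumi / events)
weights *= reduce(np.multiply, [rec[br] for br in weight_branches])
rec = recfunctions.rec_drop_fields(rec, weight_branches)
rec = recfunctions.rec_append_fields(rec, names='weight', data=weights, dtypes='f8')
print(rec.dtype.names)  # ('mmc_mass', 'weight')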
Example #9
def read_file_prune_fields_clean_values(infile_name, x_name, y_name):
    data = np.recfromcsv(infile_name, delimiter=',')
    data = retain_relevant_fields(data)
    data = data[data[y_name] != -1]  #this takes care of the garbage rows
    return data[y_name], data[x_name], rf.rec_drop_fields(
        data, [y_name, x_name])
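
A hedged sketch of the same target/feature/rest split on an in-memory toy array, without the CSV reading or the aggregation step; the field names are illustrative and `rf` is numpy.lib.recfunctions.

import numpy as np
import numpy.lib.recfunctions as rf

data = np.array([(1.0, 10.0, 0.5), (2.0, 20.0, 0.7)],
                dtype=[('y', 'f8'), ('x', 'f8'), ('other', 'f8')])

y, x, rest = data['y'], data['x'], rf.rec_drop_fields(data, ['y', 'x'])
print(rest.dtype.names)  # ('other',)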
Example #10
def get_triggers(channel, etg, segments, cache=None, snr=None, frange=None,
                 columns=None, raw=False, **kwargs):
    """Get triggers for the given channel
    """
    # get table from etg
    try:
        Table = TABLE[etg.lower()]
    except KeyError as e:
        e.args = ('Unknown ETG %r, cannot map to LIGO_LW Table class' % etg,)
        raise
    tablename = strip_table_name(Table.tableName)
    # get default columns for this table
    if columns is None:
        for key in COLUMNS:
            if issubclass(Table, key):
                columns = COLUMNS[key][:]
                break
    if 'channel' in columns:
        columns.remove('channel')

    # find triggers
    if cache is None:
        cache = find_trigger_files(channel, etg, segments, **kwargs)

    # read cache
    trigs = lsctables.New(Table, columns=columns)
    cache = cache.unique()
    cache.sort(key=lambda x: x.segment[0])
    for segment in segments:
        if len(cache.sieve(segment=segment)):
            if tablename.endswith('_inspiral'):
                filt = lambda t: float(t.get_end()) in segment
            else:
                filt = lambda t: float(t.get_peak()) in segment
            trigs.extend(Table.read(cache.sieve(segment=segment), filt=filt))

    # format table as numpy.recarray
    recarray = trigs.to_recarray(columns=columns)

    # filter
    if snr is not None:
        recarray = recarray[recarray['snr'] >= snr]
    if tablename.endswith('_burst') and frange is not None:
        recarray = recarray[
            (recarray['peak_frequency'] >= frange[0]) &
            (recarray['peak_frequency'] < frange[1])]

    # return basic table if 'raw'
    if raw:
        return recarray

    # otherwise spend the rest of this function converting the table into
    # something useful for the hveto core analysis
    addfields = {}
    dropfields = []

    # append channel to all events
    columns.append('channel')
    addfields['channel'] = numpy.repeat(channel, recarray.shape[0])

    # rename frequency column
    if tablename.endswith('_burst'):
        recarray = recfunctions.rename_fields(
            recarray, {'peak_frequency': 'frequency'})
        idx = columns.index('peak_frequency')
        columns.pop(idx)
        columns.insert(idx, 'frequency')

    # map time to its own column
    if tablename.endswith('_inspiral'):
        tcols = ['end_time', 'end_time_ns']
    elif tablename.endswith('_burst'):
        tcols = ['peak_time', 'peak_time_ns']
    else:
        tcols = None
    if tcols:
        times = recarray[tcols[0]] + recarray[tcols[1]] * 1e-9
        addfields['time'] = times
        dropfields.extend(tcols)
        columns = ['time'] + columns[2:]

    # add and remove fields as required
    if addfields:
        names, data = zip(*addfields.items())
        recarray = recfunctions.rec_append_fields(recarray, names, data)
        recarray = recfunctions.rec_drop_fields(recarray, dropfields)

    return recarray[columns]
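
The final column reshuffling (rename the frequency field, derive a 'time' column, drop the raw time fields) can be exercised on its own. A minimal sketch with a toy recarray; the field names follow the sngl_burst case and the values are made up.

import numpy
from numpy.lib import recfunctions

recarray = numpy.array(
    [(1126259462, 123000000, 150.0, 8.0), (1126259463, 456000000, 90.0, 6.5)],
    dtype=[('peak_time', 'i8'), ('peak_time_ns', 'i8'),
           ('peak_frequency', 'f8'), ('snr', 'f8')])

recarray = recfunctions.rename_fields(recarray, {'peak_frequency': 'frequency'})
times = recarray['peak_time'] + recarray['peak_time_ns'] * 1e-9
recarray = recfunctions.rec_append_fields(recarray, ['time'], [times])
recarray = recfunctions.rec_drop_fields(recarray, ['peak_time', 'peak_time_ns'])
print(recarray.dtype.names)  # ('frequency', 'snr', 'time')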
Example #11
    def drop_column(self, key):
        """Drop the column `key` and its associated scale and unit entries."""
        self.d = npr.rec_drop_fields(self.d, key)
        del self.scale[key]
        del self.unit[key]
Example #12
    def filter_events(
        self,
        img_heights_px,
        drop_events_on_top=False,
        drop_events_on_bottom=False,
        drop_longer_and_shorter=False,
        drop_positive=False,
        drop_negative=False,
        force_keep={},
        force_drop={},
        force_position={},
        obsolete_regions={},
    ):
        """
        Filter out wrongly detected events:

        - events shorted than 0.9 or longer than 1.1 of the median event length
        - events starting on the image top (partial events)
        - events ending on the image bottom (partial events)
        - events in override_bad

        Fix events starts according to override_start.

        :param img_heights_px: image height for all cameras, {cam: height_px, ... }
        :param drop_events_on_top: drop events starting on the top (first row or first row after obsolete region)
        :param drop_events_on_bottom: drop events ending at the bottom (last row or last row before obsolete region)
        :param drop_longer_and_shorter: drop events of nonstandard length, apply only to events that are not split
        :param force_keep: force events to NOT BE filtered, specify events by a record array with frame, position
                        and positivity combination, e.g. {cam: [(frame, positive), (frame, positive)...], cam: ... }
        :param force_drop: force events to BE filtered, {cam: [(frame, position_px), (frame, position_px), ...], cam: ... }
        :param force_position: override event position, {cam: [(frame, horizontal position in px), ...], cam: ... }
        :param obsolete_regions: ignored stripes on the top and/or image bottom,
                                 {cam: {'top': top_px, 'bottom': bot_px}, ...}
        """

        # compute median event length in px for the cameras with the same img height
        heights_px = set(img_heights_px.values())
        median_event_length_px = {}
        mask_events_not_split = {}
        events = {}
        for cam in self.events.keys():
            cam_events = self.events[cam]
            override_bad_mask = self.__queries2mask__(
                cam_events, force_drop[cam] if cam in force_drop else None)
            override_good_mask = self.__queries2mask__(
                cam_events, force_keep[cam] if cam in force_keep else None)

            # filter out events
            mask_bad = np.zeros(len(cam_events), dtype=bool)
            if drop_events_on_top:
                mask_bad |= (cam_events["position_px"] <= obsolete_regions[cam]
                             ["top"]) & cam_events["positive"]
            if drop_events_on_bottom:
                mask_bad |= (cam_events["position_px"] >=
                             img_heights_px[cam] - 1) & ~cam_events["positive"]
            if drop_positive:
                mask_bad |= cam_events["positive"]
            if drop_negative:
                mask_bad |= ~cam_events["positive"]

            # if drop_longer_and_shorter:
            #     # apply filter only to the events that are not split (naturally shortened)
            #     event_length_px = median_event_length_px[img_heights_px[cam]]
            #     mask_bad |= mask_events_not_split[cam] & \
            #                 (((cam_events['end'] - cam_events['position_px']) < event_length_px * 0.9) |
            #                 ((cam_events['end'] - cam_events['position_px']) > event_length_px * 1.1))

            mask_bad |= override_bad_mask

            # force events to stay
            events[cam] = cam_events[~mask_bad | override_good_mask]
            # override event position
            if force_position and cam in force_position:
                for row in force_position[cam]:
                    query = rec_drop_fields(force_position[cam],
                                            ["position_px"])
                    idxs = np.nonzero(self.__queries2mask__(
                        events[cam], query))[0]
                    if len(idxs) == 0:
                        logging.warning("force_position can"
                                        "t find a matching event: %s" %
                                        str(query))
                    elif len(idxs) > 1:
                        logging.warning(
                            "force_position ambiguous match for query: %s" %
                            str(query))
                    else:
                        events[cam][
                            idxs[0]]["position_px"] = row["position_px"]

        self.events = events
Example #13
    def __setitem__(self, keys, values):
        """Either a single one- or multi-column or mulitiple one-colum items."""
        if isinstance(values, pd.Series):
            values = values.values
            if values.dtype.char == 'O' or 'int' in values.dtype.name:
                values = values.astype(str)

        names_to_remove = []
        if isinstance(keys, str):
            # TODO: check that no-one accidentally overwrites the index?
            # quite unlikely though as self.index_key is not common
            # if keys == self.index_key:
            #     raise ValueError('The key {} is reserved for the index in BoundStructArray. '
            #                      .format(self.index_key))
            keys = [keys]
            # check if values is nested, if not, it's not multicolumn
            if (not hasattr(values[0], '__len__')
                    or len(values[0]) == 1  # seems we do not need this, as the previous check matches already
                    or np.array(values[0]).dtype.char in {'S', 'U'}):  # a string is passed
                values = [values]
            else:  # otherwise it's a multicolumn key
                key_multicol = keys[0]
                if keys[0] not in self._keys_multicol:
                    self._keys_multicol += [key_multicol]
                    self._keys += [key_multicol]
                # generate single-column keys
                keys = _gen_keys_from_key_multicol(key_multicol,
                                                   len(values[0]))
                self._keys_multicol_lookup[key_multicol] = keys
                # remove all fields from the array that are not among keys
                keys_set = set(keys)
                for name in self.dtype.names:
                    if name.startswith(key_multicol) and name not in keys_set:
                        names_to_remove.append(name)
                values = np.array(values)
                if values.shape[0] == self.shape[0]:
                    values = values.T
                else:
                    raise ValueError(
                        'You provided an array with {} rows but it needs '
                        'to have {}.'.format(values.shape[0], self.shape[0]))
        else:
            values = np.array(
                values)  # sequence of arrays or matrix with n_keys *rows*
            if values.shape[0] == self.shape[0]:
                values = values.T
            else:
                raise ValueError(
                    'You provided an array with {} rows but it needs '
                    'to have {}.'.format(values.shape[0], self.shape[0]))
        keys = np.array(keys)
        values = np.array(
            values)  # sequence of arrays or matrix with n_keys *rows*
        # update keys
        for key in keys:
            if (key != self.index_key and key not in self._keys
                    and _key_belongs_to_which_key_multicol(
                        key, self._keys_multicol) < 0):
                self._keys += [key]

        if len(keys) != len(values):
            print(keys, values)
            raise ValueError(
                'You passed {} column keys but {} arrays as columns. '
                'If you passed a matrix instead of a sequence '
                'of arrays, try transposing it.'.format(
                    len(keys), len(values)))

        if values.shape[1] != self.shape[0]:
            raise ValueError('You want to add a column with {} rows '
                             'but it needs to have {} rows.'.format(
                                 values.shape[1], self.shape[0]))

        if values.dtype.char in {'U', 'S'}:
            try:
                itemsize = values.dtype.itemsize
                if values.dtype.char == 'U':
                    itemsize //= 4
                if itemsize > np.dtype(STRING_TYPE).itemsize:
                    logg.m('WARNING: truncating strings to length {}'.format(
                        np.dtype(STRING_TYPE).itemsize))
                values = values.astype(STRING_TYPE)
            except UnicodeEncodeError:
                raise ValueError(
                    'Currently only support ascii strings. Don\'t use "ö" etc. for sample annotation.'
                )

        present = np.intersect1d(keys, self.dtype.names)
        absent = np.setdiff1d(keys, self.dtype.names)

        if any(present):
            for k, v in zip(present, values[np.in1d(keys, present)]):
                if (v.dtype != self.dtype[k]
                        and v.dtype.itemsize > self.dtype[k].itemsize):
                    # TODO: need to reallocate memory
                    # or allow storing objects, or use pd.dataframes
                    raise SetKeyError(k, v.dtype, self.dtype[k])
                super(BoundStructArray, self).__setitem__(k, v)

        if any(absent):
            if values.shape[1] > len(self):
                raise ValueError(
                    'New column has too many entries ({} > {})'.format(
                        values.shape[1], len(self)))
            source = append_fields(self,
                                   absent,
                                   values[np.in1d(keys, absent)],
                                   usemask=False,
                                   asrecarray=True)
            if names_to_remove:
                source = rec_drop_fields(source, names_to_remove)
            new = BoundStructArray(source,
                                   self.index_key,
                                   self._is_attr_of,
                                   keys_multicol=self._keys_multicol)
            setattr(self._is_attr_of[0], self._is_attr_of[1], new)