Example #1
def _regress_chrom(chrom_to_do):
    """Applies _regress_tfam() to all of the transcript families on a chromosome"""
    chrom_orfs = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r', where="chrom == %r and tstop > 0 and tcoord > 0" % chrom_to_do,
                             columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen', 'chrom', 'gcoord', 'gstop', 'strand',
                                      'codon', 'orftype', 'annot_start', 'annot_stop'])
    # tcoord > 0 removes ORFs where the first codon is an NTG, to avoid an indexing error
    # Those ORFs would never get called anyway since they couldn't possibly have any reads at their start codon

    if restrictbystartfilenames:
        restrictedstarts = pd.DataFrame()
        for (restrictbystart, minw) in zip(restrictbystartfilenames, opts.minwstart):
            restrictedstarts = restrictedstarts.append(
                pd.read_hdf(restrictbystart, 'start_strengths', mode='r', where="(chrom == %r) & (W_start > minw)" % chrom_to_do,
                            columns=['tfam', 'chrom', 'gcoord', 'strand']), ignore_index=True).drop_duplicates()
        chrom_orfs = chrom_orfs.merge(restrictedstarts)  # inner merge acts as a filter

    if chrom_orfs.empty:
        if opts.verbose > 1:
            logprint('No ORFs found on %s' % chrom_to_do)
        return failure_return

    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

    res = tuple([pd.concat(res_dfs) for res_dfs in zip(*[_regress_tfam(tfam_set, gnd) for (tfam, tfam_set) in chrom_orfs.groupby('tfam')])])

    for inbam in inbams:
        inbam.close()

    if opts.verbose > 1:
        logprint('%s complete' % chrom_to_do)

    return res
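
A minimal, self-contained sketch of the where= filtering used above (hypothetical file and column names): row-level queries like this only work when the HDF file was written in table format with the queried columns declared as data columns.

import pandas as pd

demo = pd.DataFrame({'chrom': ['chr1', 'chr2', 'chr1'], 'tcoord': [0, 5, 12]})
# format='table' plus data_columns makes 'chrom' and 'tcoord' queryable via where=
demo.to_hdf('orfs_demo.h5', 'all_orfs', format='table', data_columns=True)

subset = pd.read_hdf('orfs_demo.h5', 'all_orfs', where="chrom == 'chr1' and tcoord > 0")
print(subset)  # only the chr1 row with tcoord == 12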
Example #2
def create_subset(dest_store, dest_skims, maxZone, households_sample_size=0):

    dest_store_path = os.path.join(dest_data_dir, dest_store)
    dest_skims_path = os.path.join(dest_data_dir, dest_skims)

    print('land_use_taz')
    df = pd.read_hdf(source_store, 'land_use_taz')
    df = df[df.index <= maxZone]
    df.to_hdf(dest_store_path, 'land_use_taz')
    del df

    print('households')
    hh_df = pd.read_hdf(source_store, 'households')
    hh_df = hh_df[hh_df.TAZ <= maxZone]
    if households_sample_size:
        hh_df = hh_df.take(np.random.choice(len(hh_df), size=households_sample_size, replace=False))
    hh_df.to_hdf(dest_store_path, 'households')

    print('persons')
    per_df = pd.read_hdf(source_store, 'persons')
    per_df = per_df[per_df.household_id.isin(hh_df.index)]
    per_df.to_hdf(dest_store_path, 'persons')

    # process all skims
    skims = omx.open_file(source_skims)
    skims_out = omx.open_file(dest_skims_path, 'w')

    skimsToProcess = skims.list_matrices()
    for skimName in skimsToProcess:
        print(skimName)
        skims_out[skimName] = skims[skimName][0:maxZone, 0:maxZone]
        skims_out[skimName].attrs.TITLE = ''  # remove funny character for OMX viewer
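
The function relies on source_store, source_skims, and dest_data_dir being defined at module level; a hypothetical call (all names and paths illustrative, not from the original script) might look like:

import os
import numpy as np
import pandas as pd
import openmatrix as omx

# assumed module-level inputs, not shown in the snippet above
dest_data_dir = 'data_subset'
source_store = 'data/full_store.h5'
source_skims = 'data/full_skims.omx'

create_subset(dest_store='subset_store.h5', dest_skims='subset_skims.omx',
              maxZone=190, households_sample_size=1000)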
Example #3
def GetFiles():
    '''

    Get All the files with relevant tickers

    :return:
    '''
    flist = sorted(os.listdir('Z:/TAQ/TAQHDF5/'))

    for ff in flist:
        if ff.replace('taq_','')[:4]>='2001' and ff.replace('taq_','')[:4]<'2014':
            print "Downloading..."
            t0=datetime.datetime.now()
            #ff = 'taq_20131231.h5'
            path = "Z:/TAQ/TAQHDF5/" + ff
            df = pd.read_hdf(path,'Trades')
            ind = pd.read_hdf(path,'TradeIndex')
            ind['end'] = np.cumsum(ind['count'])
            symlist = 'AAPL AXP BA CAT CSCO CVX DD DIS GE GS HD IBM INTC JNJ JPM KO MCD MMM MRK MSFT NKE PFE PG TRV UNH UTX V VZ WMT XOM'.split(' ')  # Dow 30 tickers
            ind['ticker'] = [str(j).strip() for j in ind['ticker']]
            ind = ind[ind['ticker'].isin(symlist)].reset_index(drop=True)
            ran = np.array([range(start,end) for start,end in zip(ind['start'],ind['end'])])
            ran = [item for sublist in ran for item in sublist]
            df = df[df.index.isin(ran)]
            df['time'] = pd.to_datetime(df['utcsec'],unit='s')
            for i in ind.index:
                start = int(ind.loc[i,'start'])
                end = int(ind.loc[i,'end'])
                df.loc[start:end,'sym'] = ind.loc[i,'ticker']
            df.to_csv('data/taq/' + ff.replace('taq_','').replace('.h5','')+'.csv',columns=['time','price','sym'],index=False)
            print datetime.datetime.now()-t0
Example #4
    def from_analysis_file(data_set, analysis_file):
        dg = DriftingGratings(data_set)

        try:
            dg.populate_stimulus_table()

            dg._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response_dg")
            dg._mean_sweep_response = pd.read_hdf(analysis_file, "analysis/mean_sweep_response_dg")
            dg._peak = pd.read_hdf(analysis_file, "analysis/peak")

            with h5py.File(analysis_file, "r") as f:
                dg._response = f["analysis/response_dg"].value
                dg._binned_dx_sp = f["analysis/binned_dx_sp"].value
                dg._binned_cells_sp = f["analysis/binned_cells_sp"].value
                dg._binned_dx_vis = f["analysis/binned_dx_vis"].value
                dg._binned_cells_vis = f["analysis/binned_cells_vis"].value
                if "analysis/noise_corr_dg" in f:
                    dg.noise_correlation = f["analysis/noise_corr_dg"].value
                if "analysis/signal_corr_dg" in f:
                    dg.signal_correlation = f["analysis/signal_corr_dg"].value
                if "analysis/rep_similarity_dg" in f:
                    dg.representational_similarity = f["analysis/rep_similarity_dg"].value

        except Exception as e:
            raise MissingStimulusException(e.args)

        return dg
Example #5
 def merge_temp_databases(id_obs,store,file):
   store.append('events',pd.read_hdf(os.path.join(PATH.TMP_FOLDER,file),'events'),data_columns=['Pulse','SAP','BEAM','DM','Time'])
   meta_data = pd.read_hdf(os.path.join(PATH.TMP_FOLDER,file),'meta_data')
   meta_data.reset_index(inplace=True,drop=True)
   meta_data['version'] = args.vers
   store.append('meta_data',meta_data)
   os.remove(os.path.join(PATH.TMP_FOLDER,file))
Example #6
def read_test_train(train_size):
    print("Load train.csv")
    train = pd.read_hdf("../modified_data/train_original.csv.hdf", 'table')
    null_count = train.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = train.isnull().any(axis=0)
        print(cols[cols == True])
        rows = train.isnull().any(axis=1)
        print(rows[rows == True])
        print('NANs in train, please check it!')
        exit()
    split = round((1-train_size)*len(train.index))
    train = train[split:]
    print("Load test.csv")
    test = pd.read_hdf("../modified_data/test.hdf", 'table')
    null_count = test.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = test.isnull().any(axis=0)
        print(cols[cols == True])
        print('NANs in test, please check it!')
        exit()
    features = get_features(train, test)
    return train, test, features
Example #7
	def __init__(self, name='unnamed', description=''):
		self._datafile = '%s/Genes.h5' % dataDirectory()
		self._dataframe = pandas.read_hdf(self._datafile, 'data')
		self._metadata = pandas.read_hdf(self._datafile, 'metadata')

		self.name = name
		self.description = description
Example #8
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # test compute = False
    with tmpfile('h5') as fn:
        r = a.to_hdf(fn, '/data', compute=False)
        r.compute()
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])
Example #9
def bbq_sauce_piquant():
    with recipes_db:
        cur = recipes_db.cursor()
        cur.execute("SELECT * FROM Recipe_IDs")
        query_results = cur.fetchall()
    for result in query_results:
        if 'bbq' in result[1].lower():
            r_id = result[0]
            print r_id, result[1]
            break
    # grab ingredients for this recipe
    cur.execute("SELECT Ingredient FROM Ingredient_List WHERE ID = " + str(r_id))
    query_results = cur.fetchall()
    ingredients = []
    for row in query_results:
        ingredients.append(row[0])

    # find recommended ingredients
    flavors = pd.read_hdf(data_dir + 'flavor_profiles_nnls.h5', 'df')
    piquant_ingredients = flavors['type'] == 'piquant'
    graph = pd.read_hdf(data_dir + 'ingredient_graph.h5', 'df')
    # get distances between ingredients in recipe and potential additions
    graph = graph.ix[piquant_ingredients][ingredients]
    graph_distances = graph.sum(axis=1)
    recommended_ingredients = list(np.sort(graph_distances)[::-1][:5].index)

    print 'Recipe ingredients:'
    print ingredients
    print ''
    print 'Recommended Piquant ingredients:'
    print recommended_ingredients
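
DataFrame.ix used above was removed in pandas 1.0; a minimal sketch (toy data) of the equivalent boolean-row / label-column selection with .loc:

import pandas as pd

graph = pd.DataFrame([[1.0, 0.2], [0.3, 0.9]],
                     index=['chili', 'basil'], columns=['chili', 'basil'])
piquant_mask = pd.Series([True, False], index=['chili', 'basil'])
# rows picked by boolean mask, columns by label list
print(graph.loc[piquant_mask.values, ['basil']])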
Example #10
    def get_dt_deep(self, compute=False):
        filename = os.path.join(self.datadir, 'dt_deep{}.h5'.format(self.kwarg_tag))

        compute = compute or not os.path.exists(filename)  # recompute when requested or when no cached file exists

        if not compute:
            try:
                dt_deep = pd.read_hdf(filename, 'dt_deep')
            except Exception:
                compute = True

        if compute:
            # need grid to work with first
            df = self.get_df()

            # Make bucket for derivative to go in
            df['dt_deep'] = np.nan

            # Compute derivative for each (feh, age) isochrone, and fill in
            for f, m in tqdm(itertools.product(*df.index.levels[:2]),
                             total=len(list(itertools.product(*df.index.levels[:2]))),
                             desc='Computing dt/deep'):
                subdf = df.loc[f, m]
                log_age = np.log10(subdf['star_age'])
                deriv = np.gradient(log_age, subdf['eep'])
                subdf.loc[:, 'dt_deep'] = deriv

            df.dt_deep.to_hdf(filename, 'dt_deep')
            dt_deep = pd.read_hdf(filename, 'dt_deep')

        return dt_deep
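
The read-from-cache-or-recompute pattern above can be factored into a small helper; this is a generic sketch with illustrative names, not part of the original class:

import os
import pandas as pd

def cached_hdf(filename, key, compute_fn):
    """Return the cached DataFrame if it can be read, otherwise compute and cache it."""
    if os.path.exists(filename):
        try:
            return pd.read_hdf(filename, key)
        except Exception:
            pass  # unreadable cache: fall through and recompute
    df = compute_fn()
    df.to_hdf(filename, key)
    return df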
Example #11
 def get_dtypes(self, where=None):
     if not where:
         filename=self.get_store_filename("dtypes")
         print filename
         return read_hdf(filename,self.get_store_key(),)
     else:
         return read_hdf(self.get_store_filename("dtypes"),self.get_store_key(),where=where)
Example #12
def main(fname, blpath, odir, year, month):
    print "Applying EI Rules 1 and 2."
    
    hdf_filepath = odir + "/%s_%s_store_df.h5" % (year, month)
    print "LOOKING for HDF file at location ", hdf_filepath

    if os.path.exists(hdf_filepath):
        print "READING HDF"
        ei_df = pd.read_hdf(hdf_filepath, 'ei_df')
        bl_df = pd.read_hdf(hdf_filepath, 'bl_df')
    else:
        ei_df = pd.read_csv(fname, header=0, sep=";", converters=converters, names=cols, quotechar="'", decimal=",")
        ei_df, bl_df = bl_prepare(ei_df, blpath)
        print "Doing setup..."
        ei_df, bl_df = setup(ei_df, bl_df)
        print "SAVING HDF to", hdf_filepath
        ei_df.to_hdf(hdf_filepath, 'ei_df')
        bl_df.to_hdf(hdf_filepath, 'bl_df')

    print "Entering rule 1..."
    ei_df = rule1(ei_df, bl_df, RECEIVER)
    ei_df = rule1(ei_df, bl_df, SENDER)
    print "Entering rule 2..."
    ei_df = rule2(ei_df)
    print ei_df
    output_values = ["purchase_value", "remit_value", "transfer_value", "devolution_value", "icms_credit_value",  "remit_value", "tax", "icms_tax", "transportation_cost", "year", "month"]
    output_name = "%s_%s" % (year,month)
    print "Making tables..."
    ymsrp = make_table(ei_df, "srp", output_values, odir, output_name, year=year, month=month)
Example #13
    def _evaluate_data(self, feature_set_name):
        x_train = pd.read_hdf(os.path.join(self._data_dir, feature_set_name, 'train_train_features.hf5'), 'data')
        y_train = pd.read_hdf(os.path.join(self._data_dir, 'train_train_y.hf5'), 'data')

        x_validation = pd.read_hdf(os.path.join(self._data_dir, feature_set_name, 'train_validation_features.hf5'), 'data')
        y_validation = pd.read_hdf(os.path.join(self._data_dir, 'train_validation_y.hf5'), 'data')

        dtrain = xgb.DMatrix(x_train, y_train['target'], missing=-1)
        dtest = xgb.DMatrix(x_validation, y_validation['target'], missing=-1)

        results = {}
        for i_seed in range(0, self.NB_SEEDS):
            evals_result = {}
            params = {'bst:max_depth': self._max_depth,
                      'bst:eta': self._eta,
                      'objective': 'binary:logistic',
                      'colsample_bytree': self._col_sample,
                      'subsample': self._sub_sample,
                      'min_child_weight': self._min_child_weight,
                      'eval_metric': 'auc',
                      'silent': 1,
                      'nthread': 16,
                      'seed': i_seed}

            eval_list = [(dtest, 'eval')]
            bst = xgb.train(params, dtrain, self._nb_rounds, eval_list, evals_result=evals_result, verbose_eval=False, early_stopping_rounds=200)
            results[i_seed] = evals_result['eval'][-1]

            save_model(bst, evals_result, params, x_train.columns.tolist(), feature_set_name, 'full_evaluation', os.path.join(self._data_dir, 'Models'))
            print 'Seed {} => {}'.format(i_seed, results[i_seed])

        return results
Example #14
    def load_hdf(cls, filename, path=''):
        
        data = pd.read_hdf(filename, '{}/data'.format(path))
        t = np.array(data['t'])
        f = np.array(data['f'])
        mask = np.array(data['mask'])

        new = cls(t,f,mask=mask)

        acorr = pd.read_hdf(filename, '{}/acorr'.format(path))
        new._lag = np.array(acorr['lag'])
        new._ac = np.array(acorr['ac'])

        pgram = pd.read_hdf(filename, '{}/pgram'.format(path))
        new._pers = np.array(pgram['period'])
        new._pgram = np.array(pgram['pgram'])

        #store.close()

        i=1
        has_sub = True
        new.subseries = {}
        while has_sub:
            try:
                name = 'sub{}'.format(i)
                new.subseries[name] = cls.load_hdf(filename, path='{}/{}'.format(path,name))
            except KeyError:
                has_sub = False
            i += 1

        return new
Example #15
    def from_analysis_file(data_set, analysis_file, stimulus):
        lsn = LocallySparseNoise(data_set, stimulus)

        lsn.populate_stimulus_table()

        if stimulus == stimulus_info.LOCALLY_SPARSE_NOISE:
            stimulus_suffix = stimulus_info.LOCALLY_SPARSE_NOISE_SHORT
        elif stimulus == stimulus_info.LOCALLY_SPARSE_NOISE_4DEG:
            stimulus_suffix = stimulus_info.LOCALLY_SPARSE_NOISE_4DEG_SHORT
        elif stimulus == stimulus_info.LOCALLY_SPARSE_NOISE_8DEG:
            stimulus_suffix = stimulus_info.LOCALLY_SPARSE_NOISE_8DEG_SHORT

        try:

            with h5py.File(analysis_file, "r") as f:
                k = "analysis/mean_response_%s" % stimulus_suffix
                if k in f:
                    lsn._mean_response = f[k].value

            lsn._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response_%s" % stimulus_suffix)
            lsn._mean_sweep_response = pd.read_hdf(analysis_file, "analysis/mean_sweep_response_%s" % stimulus_suffix)

            with h5py.File(analysis_file, "r") as f:
                lsn._cell_index_receptive_field_analysis_data = LocallySparseNoise.read_cell_index_receptive_field_analysis(f, stimulus)

        except Exception as e:
            raise MissingStimulusException(e.args)

        return lsn
Example #16
def main(opts, flgs):
    if not opts['training_hdf']:
        opts['training_hdf'] = opts['hdf']

    df = pnd.read_hdf(opts['hdf'], str(opts['data']))
    data = df[df['non_null_cells'] > int(opts['area_size'])]
    if opts['training_kchk'] and opts['training_ychk']:
        Kchk = pnd.read_hdf(opts['training_hdf'], str(opts['training_kchk']))
        ychk = pnd.read_hdf(opts['training_hdf'], str(opts['training_ychk']))
        itr = extract_itr(Kchk, ychk, int(opts['training_number']))
    else:
        with open(opts['training_json'], 'r') as fp:
            tr = json.load(fp)
            check_classification(tr)
            itr, Kchk, ychk = extract_training(tr, data,
                                               int(opts['training_number']))

    conf = imp.load_source("conf", opts['training_conf'])
    mls = getattr(conf, opts['training_mls'])
    key = None if opts['training_key'] == '' else opts['training_key']

    tdata, tKchk = transform(opts['transform'], data, Kchk)
    tK_chk, y_chk = tKchk.loc[itr], ychk.loc[itr]

    mls_classification(tdata, tK_chk, y_chk, mls,
                       hdf=opts['hdf'], out_class=opts['out_class'],
                       key=key)
Example #17
    def from_analysis_file(data_set, analysis_file, movie_name):
        nm = NaturalMovie(data_set, movie_name)
        nm.populate_stimulus_table()

        # TODO: deal with this properly
        suffix_map = {
            stiminfo.NATURAL_MOVIE_ONE: '_'+stiminfo.NATURAL_MOVIE_ONE_SHORT,
            stiminfo.NATURAL_MOVIE_TWO: '_'+stiminfo.NATURAL_MOVIE_TWO_SHORT,
            stiminfo.NATURAL_MOVIE_THREE: '_'+stiminfo.NATURAL_MOVIE_THREE_SHORT
            }

        try:
            suffix = suffix_map[movie_name]


            nm._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response"+suffix)
            nm._peak = pd.read_hdf(analysis_file, "analysis/peak")

            with h5py.File(analysis_file, "r") as f:
                nm._binned_dx_sp = f["analysis/binned_dx_sp"].value
                nm._binned_cells_sp = f["analysis/binned_cells_sp"].value
                nm._binned_dx_vis = f["analysis/binned_dx_vis"].value
                nm._binned_cells_vis = f["analysis/binned_cells_vis"].value
        except Exception as e:
            raise MissingStimulusException(e.args)

        return nm
Example #18
    def from_analysis_file(data_set, analysis_file):
        ns = NaturalScenes(data_set)
        ns.populate_stimulus_table()

        try:
            ns._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response_ns")
            ns._mean_sweep_response = pd.read_hdf(analysis_file, "analysis/mean_sweep_response_ns")
            ns._peak = pd.read_hdf(analysis_file, "analysis/peak")

            with h5py.File(analysis_file, "r") as f:
                ns._response = f["analysis/response_ns"].value
                ns._binned_dx_sp = f["analysis/binned_dx_sp"].value
                ns._binned_cells_sp = f["analysis/binned_cells_sp"].value
                ns._binned_dx_vis = f["analysis/binned_dx_vis"].value
                ns._binned_cells_vis = f["analysis/binned_cells_vis"].value

                if "analysis/noise_corr_ns" in f:
                    ns.noise_correlation = f["analysis/noise_corr_ns"].value
                if "analysis/signal_corr_ns" in f:
                    ns.signal_correlation = f["analysis/signal_corr_ns"].value
                if "analysis/rep_similarity_ns" in f:
                    ns.representational_similarity = f["analysis/rep_similarity_ns"].value

        except Exception as e:
            raise MissingStimulusException(e.args)

        return ns
Example #19
def read_grid(hdf_fname):
    """
    Load the grid information from hdf

    Parameters
    ----------
    hdf_fname: str
        filename and path to the HDF file

    Returns
    -------
        wavelength : astropy.units.Quantity
        meta : pandas.Series
        index : pandas.DataFrame
        fluxes : astropy.units.Quantity

    """

    logger.info('Reading index')
    index = pd.read_hdf(hdf_fname, 'index')
    meta = pd.read_hdf(hdf_fname, 'meta')
    logger.info('Discovered columns {0}'.format(', '.join(meta['parameters'])))

    with h5py.File(hdf_fname) as fh:
        logger.info('Reading Fluxes')
        fluxes = fh['fluxes'].__array__()

    logger.info('Fluxes shape {0}'.format(fluxes.shape))
    flux_unit = u.Unit(meta['flux_unit'])
    wavelength = pd.read_hdf(hdf_fname, 'wavelength').values[:, 0]
    wavelength = u.Quantity(wavelength, meta['wavelength_unit'])

    return wavelength, meta, index, fluxes * flux_unit
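
A hypothetical call (placeholder filename) showing how the returned objects line up with the docstring:

wavelength, meta, index, fluxes = read_grid('spectral_grid.h5')
print(meta['parameters'])            # grid parameter names
print(wavelength.unit, fluxes.unit)  # astropy units from the stored metadata
print(index.shape, fluxes.shape)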
Example #20
def load_pascal(VOCyear='VOC2012', force=False, args=None):
    """
    Load all the annotations, including object bounding boxes.
    Loads XML data in args['num_workers'] threads using joblib.Parallel.

    Warning: this takes a few minutes to load from scratch!
    """
    if args is None:
        # TODO: set this to number of cores on machine
        args = {'num_workers': 8}

    cache_filename = \
        vislab.config['paths']['shared_data'] + \
        '/pascal_{}_dfs.h5'.format(VOCyear)
    if not force and os.path.exists(cache_filename):
        images_df = pd.read_hdf(cache_filename, 'images_df')
        objects_df = pd.read_hdf(cache_filename, 'objects_df')
        return images_df, objects_df

    # Load all annotation file data (should take < 30 s).
    annotation_filenames = glob.glob(
        vislab.config['paths'][VOCyear] + '/Annotations/*.xml')
    images_df, objects_df = load_annotation_files(
        annotation_filenames, args['num_workers'])

    # Get the split information.
    splits_dir = vislab.config['paths'][VOCyear] + '/ImageSets/Main'
    images_df['_split'] = None
    for split in ['train', 'val', 'test']:
        split_filename = splits_dir + '/{}.txt'.format(split)
        if not os.path.exists(split_filename):
            print("{} split does not exist".format(split))
            continue
        with open(split_filename) as f:
            inds = [x.strip() for x in f.readlines()]
        safe_inds = set(inds).intersection(images_df.index)
        images_df['_split'].ix[safe_inds] = split

    # Drop images without a split (VOC2007 images in the VOC2012 set).
    images_df = images_df.dropna(subset=['_split'])

    # Generate image filenames
    images_df['_filename'] = images_df.apply(
        lambda r: get_image_filename_for_id(r.name, VOCyear),
        axis=1)

    # Drop corresponding images in the objects_df.
    objects_df = objects_df.ix[images_df.index]

    # Propagate split info to objects_df
    objects_df['split'] = np.repeat(
        images_df['_split'].values, images_df['_num_objects'].values)

    # Make sure that all labels are either True or False.
    images_df = images_df.fillna(False)

    images_df.to_hdf(cache_filename, 'images_df', mode='w')
    objects_df.to_hdf(cache_filename, 'objects_df', mode='a')
    return images_df, objects_df
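
The two-key cache written at the end uses mode='w' for the first frame and mode='a' for the second; a minimal self-contained illustration of that layout (demo file and keys):

import pandas as pd

images_df = pd.DataFrame({'width': [640]}, index=['img0'])
objects_df = pd.DataFrame({'class': ['person']}, index=['img0'])

images_df.to_hdf('cache_demo.h5', 'images_df', mode='w')    # recreate the file
objects_df.to_hdf('cache_demo.h5', 'objects_df', mode='a')  # add a second key to the same file
print(pd.read_hdf('cache_demo.h5', 'objects_df'))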
Example #21
def merge(settings, overwrite=False):
    """
    Merges interviews over time by household.

    Parameters
    ----------
    settings : JSON settings file
    overwrite : bool
        whether to overwrite existing files

    Returns
    -------
    None (IO)
    """
    STORE_FMT = 'm%Y_%m'
    store_path = settings['monthly_store']
    start = settings['date_start']
    end = settings['date_end']
    all_months = pd.date_range(start=start, end=end, freq='m')

    if overwrite:
        logger.info("Merging for {}".format(all_months))
    else:
        with pd.get_store(settings['merged_store']) as store:
            cached = set(store.keys())
            all_m = set([x.strftime('/' + STORE_FMT) for x in all_months])
            logger.info("Using cached for {}".format(cached & all_m))
            new = all_m - cached
            all_months = filter(lambda x: x.strftime('/' + STORE_FMT) in new,
                                all_months)

    for m0 in all_months:
        months = (x.strftime('cpsm%Y-%m')
                  for x in m.make_months(m0.strftime('%Y-%m-%d')))
        months = enumerate(months, 1)

        mis, month = next(months)
        df0 = pd.read_hdf(store_path, key=month).query('HRMIS == @mis')
        match_funcs = [m.match_age, m.match_sex, m.match_race]
        dfs = [df0]
        for mis, month in months:
            try:
                dfn = pd.read_hdf(store_path, key=month).query('HRMIS == @mis')
                dfs.append(m.match(df0, dfn, match_funcs))
            except KeyError:
                msg = "The panel for {} has no monthly data file for {}"
                logger.warn(msg.format(m0, month))
                continue

        df = m.merge(dfs)
        df = df.sort_index()
        df = m.make_wave_id(df)

        store_key = df['wave_id'].iloc[0].strftime(STORE_FMT)
        df.to_hdf(settings["merged_store"], store_key)
        logger.info("Added merged {} to {}".format(store_key,
                                                   settings['merged_store']))
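
pd.get_store used above was removed in later pandas releases; a minimal sketch of the same key inspection with pd.HDFStore (placeholder path):

import pandas as pd

with pd.HDFStore('merged.h5') as store:
    cached = set(store.keys())  # keys come back as '/m2014_01'-style paths
print(cached)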
Example #22
def import_data():
    def add_columns(df, df_i):
        df['ingredients_clean'] = ing_utils.get_ings_by_product(df, df_i)
        df['num_ingredients'] =  df['ingredients_clean'].apply(len)
        df['hier'] = df[['aisle', 'shelf', 'food_category']].values.tolist()
    df = pd.read_hdf('../foodessentials/products.h5', 'products')
    df_i = pd.read_hdf('../foodessentials/ingredients.h5', 'ingredients')
    add_columns(df, df_i)
    return df, df_i
Example #23
def calcem(): #execute this function in EM calculation directory
    process=subprocess.check_output(['export_band.py','-wbe'])
    vb=pd.read_hdf('vb.h5','vb').as_matrix()[:5][:,1:3]
    cb=pd.read_hdf('cb.h5','cb').as_matrix()[:5][:,1:3]
    os.remove('vb.h5')
    os.remove('cb.h5')
    mh,err_mh=fit_em(vb)
    me,err_me=fit_em(cb)
    return me,mh,err_me,err_mh
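
DataFrame.as_matrix() used above was removed in pandas 1.0; a small sketch (toy band data) of the same slicing with .to_numpy():

import pandas as pd

vb = pd.DataFrame({'k': range(10), 'E_up': range(10), 'E_dn': range(10)})
bands = vb.to_numpy()[:5][:, 1:3]  # first five rows, columns 1 and 2
print(bands.shape)                 # (5, 2)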
Example #24
def to_database(scenario=' ', rng=range(0, 0), urbansim_connection=get_connection_string("configs/dbconfig.yml", 'urbansim_database'),
                default_schema='urbansim_output'):
    """ df_name:
            Required parameter; the name of the table that will be read from the H5 file,
            and also the first half of the table name to be stored in the database.
        urbansim_connection:
            sql connection, default is for urbansim_database
        year:
            year of information to be captured; should be passed the same range as the simulation period,
            minus the first and last year.
        default_schema:
            The schema name under which to save the data, default is urbansim_output
    """
    conn = psycopg2.connect(database="urbansim", user="******", password="******", host="socioeca8",
                            port="5432")
    cursor = conn.cursor()
    t = (scenario,)
    cursor.execute('SELECT scenario_id FROM urbansim_output.parent_scenario WHERE scenario_name=%s', t)
    scenario_id = cursor.fetchone()
    cursor.execute('SELECT parent_scenario_id FROM urbansim_output.parent_scenario WHERE scenario_name=%s', t)
    parent_scenario_id = cursor.fetchone()
    conn.close()

    for year in rng:
        if year == 0 and scenario_id[0] == 1:
            for x in ['parcels', 'buildings', 'jobs']:

                print 'exporting ' + x + str(year) + ' ' + str(scenario_id[0])

                df = pd.read_hdf('data\\results.h5', 'base/' + x)
                df['parent_scenario_id'] = parent_scenario_id[0]
                df.to_sql(x + '_base', urbansim_connection, schema=default_schema, if_exists='append')
        elif year == rng[len(rng)-1]:
            for x in ['buildings', 'feasibility', 'jobs']:
                print 'exporting ' + x + str(year) + ' ' + str(scenario_id[0])

                df = pd.read_hdf('data\\results.h5', str(year) + '/' + x)
                if x == 'feasibility':
                    df = df['residential']
                    df.rename(columns={'total_sqft': 'total_sqft_existing_bldgs'}, inplace=True)
                    df = df[(df.addl_units > 0) | (df.non_residential_sqft > 0)]  # element-wise OR; plain 'or' is ambiguous for Series
                    df['existing_units'] = np.where(df['new_built_units'] == 0, df['total_residential_units'], \
                                                    df['total_residential_units'] - df['addl_units'])

                elif x == 'buildings':
                    df = df[df.new_bldg == 1]
                    df.sch_dev = df.sch_dev.astype(int)
                    df.new_bldg = df.new_bldg.astype(int)

                elif x == 'jobs':
                    df = df[df.index > get_max_job_id()]
                df['year'] = year
                df['scenario_id'] = scenario_id[0]
                df['parent_scenario_id'] = parent_scenario_id[0]

                df.to_sql(x, urbansim_connection, schema=default_schema, if_exists='append')
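
For the feasibility filter above, pandas needs the element-wise | operator rather than Python's or; a minimal illustration:

import pandas as pd

df = pd.DataFrame({'addl_units': [0, 2], 'non_residential_sqft': [100, 0]})
mask = (df.addl_units > 0) | (df.non_residential_sqft > 0)  # plain 'or' raises ValueError on a Series
print(df[mask])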
Example #25
def tick_data_convert_dates_single(TCKR, directory=None):
    """
    Input: single ticker in format 'TICKER.X', where X is netfonds exchange letter (N:NYSE,O:NASDAQ,A:AMEX)
    Combines all tickdata files for the ticker in the directory, default = current.
    """
    
    start_dir = os.getcwd() #save start dir so we can revert back at the end of program
    if directory==None:
        directory = start_dir
    
    os.chdir(directory)
    
    #get list of files for ticker = TCKR
    files = os.path.isfile(TCKR+'.combined.h5')
    if not files:
        print 'Error: '+  TCKR+'.combined.h5'  + '  not found'      
        return 1
    size1 = os.path.getsize(TCKR+'.combined.h5')
 
    df = pd.read_hdf(TCKR+'.combined.h5', 'dataframe')
    os.remove(TCKR+'.combined.h5')
    
#    if 'time'in df.columns.values:
#        df.index = pd.to_datetime(df['time'])
#        del df['time']
#        print TCKR + ' deleted time'
#    if 'daysecs' in df.columns.values:
#        del df['daysecs']
#        print TCKR + ' deleted daysecs'
#    if 'timeopen' in df.columns.values:
#        del df['timeopen']
#        print TCKR + ' deleted timeopen'
#    if 'timeclose' in df.columns.values:
#        del df['timeclose']
#        print TCKR + ' deleted timeclose'
#    if 'date' in df.columns.values:
#        del df['date']
#        print TCKR + ' deleted date'
#
#    df.index = pd.to_datetime(df.index)
#    print TCKR + ' converted index to timeseries'
    
    store = pd.HDFStore(TCKR+'.combined.h5')
    store.append('dataframe', df, format='table', complib='blosc', complevel=9, expectedrows=len(df))
    store.close()
    #df.to_hdf(TCKR+'.combined.h5', 'dataframe', mode='w',format='table',complib='blosc', complevel=9)
    size2 = os.path.getsize(TCKR+'.combined.h5')
    print TCKR + 'wrote to hdf file. size change=' +str(float(size2)/float(size1))
    
    df2=pd.read_hdf(TCKR+'.combined.h5', 'dataframe')
    (df2==df).all()
    if (df2.index==df.index).all():
        print TCKR + ' Indexes match!'
  
    os.chdir(start_dir)
    return 0
Example #26
    def __init__(self, features_dir, train_features_filename='train_features.hf5', test_features_filename='test_features.hf5', train_y_filename='train_y.hf5'):
        self.features_dir = features_dir
        self._train_features_filename = train_features_filename
        self._test_features_filename = test_features_filename
        self._train_y_filename = train_y_filename

        # Load features
        self._train_features = pd.read_hdf(os.path.join(features_dir, train_features_filename), 'data')
        self._test_features = pd.read_hdf(os.path.join(features_dir, test_features_filename), 'data')
        self._train_y = pd.read_hdf(os.path.join(features_dir, train_y_filename), 'data')
Example #27
def load_dataset(force=False):
    cache_filename = vislab.config['paths']['shared_data'] + '/inria_dfs.h5'
    if not force and os.path.exists(cache_filename):
        images_df = pd.read_hdf(cache_filename, 'images_df')
        objects_df = pd.read_hdf(cache_filename, 'objects_df')
        return images_df, objects_df

    objects_dfs = []
    images_dfs = []
    for split in ['Train', 'Test']:
        # Load object data.
        anno_filenames = [
            _.strip() for _
            in open('{}/{}/annotations.lst'.format(dirname, split)).readlines()
        ]
        objects_df = pd.concat((
            parse_annotation(anno_filename)
            for anno_filename in anno_filenames
        ))

        # Construct images_df from the objects data.
        grouped = objects_df.groupby(level=0)
        images_df = pd.DataFrame()
        images_df['filename'] = objects_df.groupby(level=0).first()['filename']
        images_df[['filename', 'width', 'height']] = grouped.first()[
            ['filename', 'width', 'height']]

        # We know that all objects are PASperson, but let's count them.
        images_df['PASperson'] = True
        images_df['num_objects'] = grouped.count()['class']

        # Load negative examples and append to the images_df.
        neg_filenames, neg_image_ids = map(list, zip(*[
            (_.strip(), _.strip().split('/')[-1][:-4]) for _
            in open('{}/{}/neg.lst'.format(dirname, split)).readlines()
        ]))
        neg_images_df = pd.DataFrame(index=neg_image_ids)
        neg_images_df['filename'] = neg_filenames
        neg_images_df['PASperson'] = False
        neg_images_df['num_objects'] = 0
        images_df = images_df.append(neg_images_df)

        objects_df['split'] = split
        images_df['split'] = split

        objects_dfs.append(objects_df)
        images_dfs.append(images_df)

    objects_df = pd.concat(objects_dfs)
    images_df = pd.concat(images_dfs)

    images_df.to_hdf(cache_filename, 'images_df', mode='w')
    objects_df.to_hdf(cache_filename, 'objects_df', mode='a')

    return images_df, objects_df
Example #28
def restore_db():
    filenames = glob.glob('phd_store*.h5')
    data_df = pd.read_hdf(filenames[0], 'author_df')
    for filename in filenames[1:]:
        temp = pd.read_hdf(filename, 'author_df')
        data_df = data_df.append(temp, ignore_index=True)

    # make a dataframe that is just US astro PhDs
    astro_df = data_df[(data_df['nonUS'] == False) &
                       (data_df['astroPublication'] == True)]
    return astro_df, data_df
Example #29
def load_DB():
  meta_data = pd.read_hdf(PATH.DB, 'meta_data')
  pulses = pd.read_hdf(PATH.DB, 'pulses')
  cands = pd.read_hdf(PATH.DB, 'candidates')
  cands = cands[cands.main_cand == 0]
  cands.sort_values('Sigma', inplace=True, ascending=False)
  cands = cands.groupby('BEAM').head(10)
  cands = cands.head(50)
  cands = cands[ ((cands.N_pulses == 1) & (cands.Sigma>10.)) | ((cands.N_pulses > 1) & (cands.Sigma>16.)) ]
  cands.sort_values('Sigma', inplace=True, ascending=False)
  return meta_data, pulses, cands
Example #30
    def load_params(self):
        """
        """

        self.params_matrix = pd.read_hdf(self.ref_path, 'params_matrix')

        self.paramtree = pd.read_hdf(self.ref_path, 'params')
        self.paramtree = ParamTree(df=self.paramtree)

        self.measuretree = pd.read_hdf(self.ref_path, 'measures')
        self.measuretree = ParamTree(df=self.measuretree, adimentionalized=False)
Example #31
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

dataloc = '/home/jacob/research/velas/vela2b/vela27/a0.490/'
dataloc = '/mnt/cluster/abs/cgm/vela2b/vela27/a0.490/'
boxfile = '{0:s}vela2b-27_GZa0.490.h5'.format(dataloc)

d = pd.read_hdf(boxfile, 'data')
loT, hiT = 10**4, 10**4.5
loN, hiN = 10**-5, 10**-4.5
print len(d)

cloudInds = ((d['temperature'] < hiT) & (d['temperature'] > loT) &
             (d['density'] < hiN) & (d['density'] > loN) & (d['x'] < 0) &
             (d['z'] > 0) & (np.abs(d['y']) < 300))

cloud = d[cloudInds]

loc = cloud[['x', 'y', 'z']]
locMat = loc.as_matrix()
datamean = loc.mean(axis=0).as_matrix()

uu, dd, vv = np.linalg.svd(locMat, full_matrices=True)

print uu
print dd
print vv
Example #32
 def get_trial_response_df(self):
     tdf = pd.read_hdf(self.trial_response_df_path, key='df')
     tdf.reset_index(inplace=True)
     tdf.drop(columns=['cell_roi_id'], inplace=True)
     return tdf
Example #33
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.set_printoptions(edgeitems=1000)
dat = (np.load('ssi63shitl10hr.npy'))
print(int(dat[0, -1]))
exit()

df = pd.read_hdf('../../flight-data/ssi63.h5')
[print(k) for k in df.keys()]
dfn = df[[
    'raw_pressure_1', 'raw_pressure_2', 'raw_pressure_3', 'raw_pressure_4',
    'raw_temp_1', 'raw_temp_2', 'raw_temp_3', 'raw_temp_4', 'lat_gps',
    'long_gps', 'altitude_gps', 'heading_gps', 'speed_gps', 'num_sats_gps'
]]
del df
dfn = dfn.iloc[:20 * 60 * 60 * 10]
dfn.reindex((dfn.index - dfn.index[0]) / np.timedelta64(1, 's'))
data = np.hstack(
    (((dfn.index - dfn.index[0]) / np.timedelta64(1, 's')).values.reshape(
        -1, 1), dfn.values))
np.save('ssi63shitl10hr', data)
Example #34
def ColumnCleaner(filer,key):
    framer1 = pd.read_hdf(filer,key)
    framer1 = framer1.drop(['Run','Event','SubEvent','SubEventStream','exists'], axis=1)
    cols1 = framer1.columns.tolist()
    framer1.columns = [key+'_%s' % cols1[i] for i in range(0, len(cols1[:]))]
    return framer1
Example #35
animalLists = [['adap042', 'adap043'], ['adap044', 'adap046']]
labels = ['low_freq_go_left', 'low_freq_go_right']
tuningIntensities = [40, 50, 60, 70]
plotAll = True

qualityThreshold = 3 #2
maxZThreshold = 2
ISIcutoff = 0.02

for label,animalList in zip(labels, animalLists):
    # -- Make composite celldb -- #
    allMiceDfs = []
    for animal in animalList:
        databaseFullPath = os.path.join(settings.DATABASE_PATH, '{}_database.h5'.format(animal))
        key = 'head_fixed'
        celldbThisMouse = pd.read_hdf(databaseFullPath, key=key)
        allMiceDfs.append(celldbThisMouse)
    celldb = pd.concat(allMiceDfs, ignore_index=True)


    # -- Plot histogram of responsive freqs by hemi (!only for those TTs that are in striatum!) -- #  
    goodQualCells = celldb.query("isiViolations<{} and shapeQuality>{} and astrRegion!='undetermined'".format(ISIcutoff, qualityThreshold))

    maxZscore = goodQualCells.ZscoreEachIntensity.apply(lambda x : np.max(np.abs(x)))
    goodRespCells=goodQualCells[maxZscore >= maxZThreshold]


    # -- Plot reports -- #
    outputDir = '/home/languo/data/ephys/head_fixed_astr/all_mice/responsive_freqs_by_hemi/'
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)
Example #36
"""
This script assigns all the other galaxy properties like baryonic mass and
effective radius to each galaxy in the mock catalog
"""

from progressbar import ProgressBar
import pandas as pd
import numpy as np

### Reading text files
Mr_vpeak_catalog = pd.read_csv('../data/SHAM_parallel.csv', \
                               delimiter='\t', header=None, \
                               names=['vpeak','M_r'])
eco_obs_catalog = pd.read_csv('../data/gal_Lr_Mb_Re.txt',\
                              delimiter='\s+',header=None,skiprows=2,\
                              names=['M_r','logmbary','Re'])
halocat_galcat_merged = pd.read_hdf('../data/halo_gal_Vishnu_Rockstar_macc.h5',\
                                    key='halocat_galcat_merged')
colnames = halocat_galcat_merged.columns

Mr_vpeak_catalog = Mr_vpeak_catalog.sort_values('M_r')
eco_obs_catalog = eco_obs_catalog.loc[eco_obs_catalog.Re.values >= 0]
eco_obs_catalog = eco_obs_catalog.sort_values('M_r')

pbar = ProgressBar()
nearest_match_idx_arr = []
mbary_arr = []
re_arr = []
np.random.seed(0)
for mag_value in pbar(Mr_vpeak_catalog.M_r.values):
    diff_arr = np.abs(eco_obs_catalog.M_r.values - mag_value)
    nearest_match_idx = np.where(diff_arr == diff_arr.min())[0]
    if len(nearest_match_idx) > 1:
Example #37
def plot_ROC(prediction,
             pinfo,
             ensemble=1,
             label_type=None,
             output_png=None,
             output_tex=None,
             output_csv=None):
    # Convert the inputs to the correct format
    if type(prediction) is list:
        prediction = ''.join(prediction)

    if type(pinfo) is list:
        pinfo = ''.join(pinfo)

    if type(ensemble) is list:
        ensemble = int(ensemble[0])
        # ensemble = ''.join(ensemble)

    if type(output_png) is list:
        output_png = ''.join(output_png)

    if type(output_csv) is list:
        output_csv = ''.join(output_csv)

    if type(output_tex) is list:
        output_tex = ''.join(output_tex)

    if type(label_type) is list:
        label_type = ''.join(label_type)

    # Read the inputs
    prediction = pd.read_hdf(prediction)
    if label_type is None:
        # Assume we want to have the first key
        label_type = prediction.keys()[0]
    N_1 = len(prediction[label_type].Y_train[0])
    N_2 = len(prediction[label_type].Y_test[0])

    # Determine the predicted score per patient
    print('Determining score per patient.')
    y_truths, y_scores, _, _ = plot_SVM(prediction,
                                        pinfo,
                                        label_type,
                                        show_plots=False,
                                        alpha=0.95,
                                        ensemble=ensemble,
                                        output='decision')

    # Plot the ROC with confidence intervals
    print("Plotting the ROC with confidence intervals.")
    plot = 'default'
    f, fpr, tpr = plot_ROC_CIc(y_truths, y_scores, N_1, N_2)

    if plot == 'default':
        plot = ''

    # Save the outputs
    if output_png is not None:
        f.savefig(output_png)
        print(("ROC saved as {} !").format(output_png))

    if output_tex is not None:
        tikz_save(output_tex)
        print(("ROC saved as {} !").format(output_tex))

    # Save ROC values as JSON
    if output_csv is not None:
        with open(output_csv, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['FPR', 'TPR'])
            for i in range(0, len(fpr)):
                data = [str(fpr[i]), str(tpr[i])]
                writer.writerow(data)

        print(("ROC saved as {} !").format(output_csv))

    return f, fpr, tpr
Example #38
import _pickle as cPickle
import matplotlib.cm as cm
import os
import matplotlib.gridspec as gridspec

###############################################################################################################
# TO LOAD
###############################################################################################################
data_directory = '/mnt/DataGuillaume/MergedData/'
datasets = np.loadtxt(data_directory + 'datasets_ThalHpc.list',
                      delimiter='\n',
                      dtype=str,
                      comments='#')

# WHICH NEURONS
mappings = pd.read_hdf("/mnt/DataGuillaume/MergedData/MAPPING_NUCLEUS.h5")
firing_rate = pd.read_hdf("/mnt/DataGuillaume/MergedData/FIRING_RATE_ALL.h5")
hd_index = mappings.index[np.where(mappings['hd'] == 1)[0]]
hd_index = hd_index[np.where((firing_rate.loc[hd_index] > 1.0).all(axis=1))[0]]

# SWR MODULATION
swr_mod, swr_ses = loadSWRMod(
    '/mnt/DataGuillaume/MergedData/SWR_THAL_corr.pickle',
    datasets,
    return_index=True)
nbins = 400
binsize = 5
times = np.arange(0, binsize * (nbins + 1), binsize) - (nbins * binsize) / 2
swr = pd.DataFrame(columns=swr_ses,
                   index=times,
                   data=gaussFilt(swr_mod, (1, )).transpose())
Example #39
import numpy as np
import pandas, cctbx, scitbx
from dials.array_family import flex
from cctbx import sgtbx, crystal

# load the data!
print "loading data"
data_f = "/reg/d/psdm/cxi/cxid9114/res/dermen/reflection_2colorspec.hdf5"
df = pandas.read_hdf( data_f,"reflections")
                
sg = sgtbx.space_group(" P 4nw 2abw")
Symm = crystal.symmetry( unit_cell=(79,79,38,90,90,90), space_group=sg)

print "querying"
df = df.query("BnotA")  
#df = df.query("intens2 < 5000")

print "hkl"
hkls = tuple( map( tuple, df[['hB','kB','lB']].values))

intens = np.ascontiguousarray(df.intens5.values)
data = flex.double(intens)
sigmas = flex.double( np.sqrt(intens))

mil_idx = flex.miller_index(hkls)


mill_set = cctbx.miller.set( crystal_symmetry=Symm, 
                indices=mil_idx, anomalous_flag=True)
mill_ar = cctbx.miller.array(mill_set, data=data, sigmas=sigmas)\
            .set_observation_type_xray_intensity()
Example #40
def test_trigger_type_in_dl1_params():
    from lstchain.io.io import dl1_params_lstcam_key
    params = pd.read_hdf(dl1_file, key=dl1_params_lstcam_key)
    assert 'trigger_type' in params.columns
Example #41
from __future__ import print_function
from statsmodels.compat import iteritems, cStringIO

import numpy as np
import pandas as pd

sio = cStringIO.StringIO()
c = pd.read_hdf('kpss_critical_values.h5', 'c')
ct = pd.read_hdf('kpss_critical_values.h5', 'ct')

data = {'c': c, 'ct': ct}
for k, v in iteritems(data):
    n = v.shape[0]
    selected = np.zeros((n, 1), dtype=np.bool)
    selected[0] = True
    selected[-1] = True
    selected[v.index == 10.0] = True
    selected[v.index == 5.0] = True
    selected[v.index == 2.5] = True
    selected[v.index == 1.0] = True
    max_diff = 1.0
    while max_diff > 0.05:
        xp = np.squeeze(v[selected].values)
        yp = np.asarray(v[selected].index, dtype=np.float64)
        x = np.squeeze(v.values)
        y = np.asarray(v.index, dtype=np.float64)
        yi = np.interp(x, xp, yp)
        abs_diff = np.abs(y - yi)
        max_diff = np.max(abs_diff)
        if max_diff > 0.05:
            selected[np.where(abs_diff == max_diff)] = True
Example #42
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Constants
FILE_PATH_TRAIN = "./input/train.h5"

# Validation Size
TEST_SIZE = 0.2

# read files
training_data = pd.read_hdf(FILE_PATH_TRAIN, "train")

# training data
# extracting the x-values 
x_values_training = training_data.copy()
x_values_training = x_values_training.drop(labels=['y'], axis=1)
x_component_training = x_values_training.values

# extracting the y-values
y_component_training = training_data['y'].values

# training the scaler
scaler = StandardScaler(with_mean=True, with_std=True)
scaler = scaler.fit(x_component_training)

# scaling the training and test data
Example #43
import pandas as pd
import numpy as np
import sklearn
from sklearn import neural_network
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

train_labeled = pd.read_hdf("train_labeled.h5", "train")
train_unlabeled = pd.read_hdf("train_unlabeled.h5", "train")
test = pd.read_hdf("test.h5", "test")

train_labeled = np.array(train_labeled)

features = train_labeled[:, 1:129]

lables = train_labeled[:, 0]

print(lables)
print(features)
print(lables.size)
print(features.size)

nn = neural_network.MLPClassifier(hidden_layer_sizes=(256, ),
                                  activation='relu',
                                  solver='adam',
                                  alpha=1,
                                  learning_rate='constant',
                                  learning_rate_init=0.0001,
                                  power_t=0.5,
Example #44
########## Buffer L T21 @3mM MgCl2 + 150 NaCl
### exp200
labels.extend(['Cy3b_R1-36#:R1s1-8:40nM_L21_exp200_p038uW'])
labels.extend(['Cy3b_R1-36#:R1s1-8:40nM_L21_exp200_p114uW'])
labels.extend(['Cy3b_R1-36#:R1s1-8:40nM_L21_exp200_p250uW'])

dir_names = [
    '/fs/pool/pool-schwille-paint/Analysis/p06.SP-tracking/immobile/tracking-handle/z.datalog'
] * len(labels)
############################################################## Read in data
#### Load & Sorting
path = [
    os.path.join(dir_names[i], labels[i] + '_stats.h5')
    for i in range(0, len(labels))
]
X = pd.concat([pd.read_hdf(p, key='result') for p in path])

X.sort_index(axis=0, ascending=True, inplace=True)
X.sort_index(axis=1, ascending=True, inplace=True)

#%%
############################################################## Quick selection
field = 'Tn=1e+00'
istrue = (X.power == 38) & (X.exp == 400)
Xred = X.loc[istrue, field]
#%%
exp = 200
buffers = ['B', 'L', 'L21']
colors = ['r', 'b', 'k', 'magenta']

############################################################## Plotting half
Example #45
    def get_pwi(self, segment, source, sink):
        pwi = self.segment_stores[segment].loc[source, sink]

        return pwi

    def save_output(self):
        nx.write_gpickle(self.G, '{0} Graph with PWIs.pkl'.format(self.handle))

    def run(self):
        for sc, sk, d in self.G.edges(data=True):
            for seg, val in d['segments'].items():
                pwi = self.get_pwi(seg, sc, sk)
                self.G.edge[sc][sk]['segments'][seg] = pwi
            print(self.G.edge[sc][sk])

        self.save_output()


if __name__ == '__main__':
    handle = sys.argv[1]

    segment_stores = dict()
    for i in range(1, 9):
        print('Getting segment {0} store.'.format(i))
        segment_stores[i] = pd.read_hdf(
            '{0} Thresholded Segment Affmats.h5'.format(handle),
            key='segment{0}'.format(i))

    gc = GraphPWIFinder(handle, segment_stores)
    gc.run()
Example #46
def main(args, vers=None):

    # Load Events from HDF5 database of .singlepulse file
    try:
        events = pd.read_hdf(args.filename, 'events')
        with pd.HDFStore(args.filename) as store:
            db_keys = store.keys()
        file_type = 'hdf5'
    except (IOError, HDF5ExtError) as e:
        events = Events.Loader(args)
        file_type = 'sp'

    # Select events within the defined ranges
    if args.SNR_min is not None: events = events[events.Sigma >= args.SNR_min]

    if events.empty:
        print "No events found. Exiting"
        return

    # Load meta data
    if args.meta_data is not None:
        meta_data = Events.meta_data_Loader(args.meta_data)
        if vers is not None: meta_data['version'] = vers
        meta_data['File'] = os.path.basename(args.filename)
        if args.no_store is not None:
            meta_data.to_hdf(args.store_name, 'meta_data')
    elif file_type == 'hdf5':
        if '/meta_data' in db_keys:
            meta_data = pd.read_hdf(args.filename, 'meta_data')
    else:
        meta_data = None

    # Load Pulses
    if not args.no_search: pulses = Pulses.Loader(events, args)
    elif file_type == 'hdf5':
        if '/pulses' in db_keys:
            pulses = pd.read_hdf(args.filename, 'pulses')
        else:
            print "Pulses not present in the database. Exiting"
            return
    else:
        print "Events have been loaded and stored into the HDF5 file. Exiting"
        return

    if not args.no_filter: pulses = pulses[pulses.Rank == 0]

    # Select pulses within the defined ranges
    if args.t_range is not None:
        pulses = pulses[(pulses.Time >= args.t_range[0])
                        & (pulses.Time <= args.t_range[1])]
    if args.DM_range is not None:
        pulses = pulses[(pulses.DM >= args.DM_range[0])
                        & (pulses.DM <= args.DM_range[1])]
    if args.SNR_min is not None:
        pulses = pulses[pulses.Sigma >= args.SNR_peak_min]
    if args.N_min is not None: pulses = pulses[pulses.N_events >= args.N_min]

    if pulses.empty:
        print "No pulses found. Exiting"
        return

    # Load Candidates
    cands = Candidates.Loader(pulses, args)
    if not args.no_search or not args.no_store:
        cands.to_hdf(args.store_name, 'candidates')

    cands = cands[cands.main_cand == 0]

    if cands.empty:
        print "No candidates found. Exiting"
        return

    if cands.shape[0] > 100:
        print "{} candidates found, only the brightest 100 will be processed.".format(
            cands.shape[0])
        cands = cands.head(100)

    cands = cands[((cands.N_pulses == 1) &
                   (cands.Sigma >= args.single_cand_SNR)) |
                  ((cands.N_pulses > 1) &
                   (cands.Sigma >= args.multiple_cand_SNR))]
    cands.sort_values('Sigma', inplace=True, ascending=False)

    #Produce the output
    if not args.no_plot:
        LSPplot.output(args, events, pulses, cands, meta_data)

    return
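
The except (IOError, HDF5ExtError) clause assumes PyTables' exception class is imported elsewhere in the module, presumably something like:

# assumed import for the exception handled above (PyTables raises it for files that are not valid HDF5)
from tables.exceptions import HDF5ExtError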
Example #47
args = parser.parse_args()

# Grep all database files
from pathlib import Path
files = []
for path in Path(args.path).rglob('*.hdf'):
    files.append(path)
for path in Path(args.path).rglob('*.hd5'):
    files.append(path)

if len(files) == 0:
    print('No new files found')
    exit()

for f in files:
    # target output
    desc = f  #.relative_to(args.path)
    output = f'{desc.parent}/{desc.stem}.parquet'

    # Original
    print(f'Converting from: {f}')
    df = pd.read_hdf(f)

    # execute
    print(f'Converting to:{output}')
    df.to_parquet(output)

    # FOLD
    print(f'RM: {f}')
    os.remove(f)
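
Note that pd.read_hdf(f) with no key only works when the file contains a single object; a minimal sketch of the conversion round-trip (placeholder filenames, requires a parquet engine such as pyarrow or fastparquet):

import pandas as pd

pd.DataFrame({'a': [1, 2]}).to_hdf('single_key.h5', 'only_key')
df = pd.read_hdf('single_key.h5')    # key may be omitted because the file holds one object
df.to_parquet('single_key.parquet')  # needs pyarrow or fastparquet installed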
Example #48
def load_df(dirpath, filename, varname=None):
    varname = filename if varname is None else varname
    fn = os.path.join(dirpath, filename)
    return read_hdf(fn, varname)
Example #49
    model.add(Dense(units=output_size))
    model.add(Activation(activ_func))

    model.compile(loss=loss, optimizer=optimizer)
    return model
"""
if __name__ == "__main__":
    with open('tickerlist.pkl', 'rb') as f:
        tickerlist = pickle.load(f)

    train, test, X_train, X_test, y_train, y_test = [], [], [], [], [], []

    for symbol in tickerlist:
        #symbol = 'ADABTC'
        print(symbol)
        df = pd.read_hdf("added_params/" + symbol + ".h5")
        df = df.drop(['time'], axis=1)
        df = df.dropna()

        l_train, l_test, l_X_train, l_X_test, l_y_train, l_y_test = prepare_data_high(
            df,
            target_col='high',
            window_len=window_len,
            zero_base=zero_base,
            test_size=test_size)

        if len(l_y_test) > 10:
            symbolhighdata = {}
            symbolhighdata['l_train'] = l_train
            symbolhighdata['l_test'] = l_test
            symbolhighdata['l_X_train'] = l_X_train
Example #50
def main(infold, mns_path, epsg_code, outfile, titre):
    """Main plotting function."""
    files = {}
    for file in infold.iterdir():
        files.update({file.name: file})

    # Extract the info
    nb = list(files.keys())[0].split("_")[-1].split(".")[0]
    width = list(files.keys())[0].split("_")[-3]
    azi = list(files.keys())[0].split("_")[-5]

    # Open gdal raster
    MNS_data, MNS_gt, MNS_ds = open_large_raster(str(mns_path))

    # Open transect
    data_tr = {}
    for fname, pth in files.items():
        if "transect" in fname:
            with open(pth, "rb") as f:
                transects = pickle.load(f)
        else:
            data_tr.update({"_".join(fname.split("_")[0:2]): pd.read_hdf(pth)})

    # Get very approximate center of transects
    midishline = transects[int(len(transects) / 2)]
    mid_point = midishline.interpolate(0.5, normalized=True)
    midpoint_buffer = mid_point.buffer(midishline.length / 2)
    envelope = midpoint_buffer.envelope

    # Turn interactive plotting off
    plt.ioff()

    # Create figure
    fig = plt.figure(figsize=(15.4, 6.6))
    fig.suptitle(titre)

    # Epsg
    proj_code = ccrs.epsg(epsg_code)

    # 2 by 2 grid
    gs = GridSpec(ncols=3, nrows=2, figure=fig, width_ratios=[0.1, 1.5, 4])
    ax = plt.subplot(gs[0, 1], projection=proj_code)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[-1, 1], projection=proj_code)
    ax3 = plt.subplot(gs[:, -1])

    # AX
    mns_masked = np.ma.masked_where(MNS_data < 0, MNS_data)
    extent = (
        MNS_gt[0],
        MNS_gt[0] + MNS_ds.RasterXSize * MNS_gt[1],
        MNS_gt[3] + MNS_ds.RasterYSize * MNS_gt[5],
        MNS_gt[3],
    )

    ax.imshow(
        mns_masked, extent=extent, origin="upper", cmap="gist_earth"
    )

    ax.plot(
        [midishline.coords[0][0], midishline.coords[-1][0]],
        [midishline.coords[0][1], midishline.coords[-1][1]],
        linestyle="-",
        color="red",
        linewidth=1,
    )

    norm = Normalize(vmin=np.min(mns_masked), vmax=np.max(mns_masked))
    cbar = ColorbarBase(
        ax1, cmap=plt.get_cmap("gist_earth"), norm=norm, orientation="vertical"
    )
    cbar.ax.yaxis.set_label_position("left")
    cbar.ax.set_ylabel("Altitude / m")

    # AX2
    ax2.imshow(
        mns_masked, extent=extent, origin="upper", cmap="gist_earth"
    )

    for line in transects:
        ax2.plot(
            [line.coords[0][0], line.coords[-1][0]],
            [line.coords[0][1], line.coords[-1][1]],
            linestyle="-",
            color="black",
            alpha=0.6,
            linewidth=0.5,
        )

    ax2.set_extent(
        [
            envelope.bounds[0],
            envelope.bounds[2],
            envelope.bounds[1],
            envelope.bounds[-1],
        ],
        crs=ccrs.epsg(epsg_code),
    )

    ax2.set_title("Zoom on transects", y=-0.2)
    # AX3

    # Plot MNT/ MNS ground
    data_tr["MNT_solnu"].T.plot(
        ax=ax3, color="sienna", alpha=0.1, legend=False
    )
    data_tr["MNT_solnu"].T.mean(axis=1).plot(
        ax=ax3, color="sienna", legend=True, label="Mean summer DTM"
    )

    data_tr["MNS_solnu"].T.plot(
        ax=ax3, color="lightgreen", alpha=0.1, legend=False
    )
    data_tr["MNS_solnu"].T.mean(axis=1).plot(
        ax=ax3, color="lightgreen", legend=True, label="Mean summer DSM"
    )

    # Plot MNS neige
    data_tr["MNS_neige"].T.plot(
        ax=ax3, color="midnightblue", alpha=0.2, legend=False
    )
    data_tr["MNS_neige"].T.mean(axis=1).plot(
        ax=ax3, color="midnightblue", legend=True, label="Mean winter DSM"
    )

    ax3.set_title(
        "Azimuth: %s°, Width: %sm, # of transects: %s" % (azi, width, nb)
    )
    ax3.set_xlabel("Distance along transect / m")
    ax3.set_ylabel("Altitude / m")
    ax3.set_xlim(0, midishline.length)
    ax3.set_ylim(
        np.nanmin(data_tr["MNT_solnu"].T.mean(axis=1)) - 5,
        np.nanmax(data_tr["MNS_neige"].T.mean(axis=1)) + 5,
    )

    ax3.xaxis.set_major_locator(MultipleLocator(10))
    ax3.xaxis.set_minor_locator(MultipleLocator(5))
    ax3.yaxis.set_major_locator(MultipleLocator(1))
    ax3.yaxis.set_minor_locator(MultipleLocator(0.5))
    ax3.xaxis.set_ticks_position("both")
    ax3.yaxis.set_ticks_position("both")
    ax3.tick_params(direction="inout", which="both")

    fig.savefig(infold.joinpath(outfile), bbox_inches="tight", dpi=300)
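
As a usage sketch only: one way the plotting routine above might be driven. The folder layout, raster path, EPSG code and title below are assumptions for illustration and do not appear in the original.

from pathlib import Path

# Hypothetical inputs: a folder of transect outputs and a summer DSM raster.
# EPSG 2154 (Lambert-93) is just an example projection code.
infold = Path("outputs/transects_az120_w5")
mns_path = Path("rasters/mns_ete.tif")
main(infold, mns_path, epsg_code=2154,
     outfile="transects_overview.png", titre="Snow-depth transects")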
Example #51
0
def read_hdf_data_psi(path='premix_data', key='of_tables', in_labels=['zeta', 'f', 'pv'], labels=['T'], scaler=None):
    # read in the HDF5 file and compute psi of the mixture
    try:
        df = pd.read_hdf(path, key=key)
    except (FileNotFoundError, KeyError):
        # re-raise: without df the rest of the function cannot run
        print('Check the data path and key')
        raise

    # read the molar weigths
    with open('molar_weights.json', 'r') as fp:
        molar_weights = json.load(fp)

    # read in the order of the species names
    with open('GRI_species_order') as f:
         all_species = f.read().splitlines()

    # numpy array of species molar weights
    molar_weights_np = np.array([molar_weights[s] for s in all_species])
    molar_weights_np = molar_weights_np/ 1000   # conversion from g to kg! This is needed for OpenFOAM
    T_vector = df['T'].to_numpy()  # .as_matrix() was removed in pandas 1.0

    # convert to ndarray
    gri_mass_frac = df[all_species].to_numpy()

    # COMPUTE THE CORRECT PSI VALUE
    R_universal = 8.314459  # universal gas constant in J/(mol K)
    psi_list = []

    print('Starting to compute psi ... ')
    # iterate over all rows
    for index in range(0,df.shape[0]):
        R_m = R_universal * sum(gri_mass_frac[index,:] / molar_weights_np)
        #df['psi'].iloc[index] = 1 / (R_m * row['T'])
        psi_list.append(1/(R_m * T_vector[index]))
        # print(index)

    # hand back the data to df
    df['psi'] = psi_list
    print('Done with psi!\n')

    input_df=df[in_labels]

    if scaler=='MinMax':
        in_scaler = preprocessing.MinMaxScaler()
        out_scaler = preprocessing.MinMaxScaler()
    elif scaler=='Standard':
        in_scaler = preprocessing.StandardScaler()
        out_scaler = preprocessing.StandardScaler()
    else:
        raise ValueError('Only possible scalers are: MinMax or Standard.')

    input_np = in_scaler.fit_transform(input_df)

    label_df=df[labels]

    label_np = out_scaler.fit_transform(label_df)
    print('\n*******************************')
    print('The scaler is %s\n' % scaler)
    print('This is the order of the labels:')
    for label in labels:
        print(label)
    print('*******************************\n')
    return input_np, label_np, df, in_scaler, out_scaler
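
A minimal sketch of the same psi computation done with NumPy broadcasting instead of the Python row loop above (using the gri_mass_frac, molar_weights_np, T_vector and R_universal already defined in the function); an equivalent alternative, not the original author's code.

# psi = 1 / (R_m * T), with R_m = R_universal * sum_i(Y_i / M_i) per row
R_m = R_universal * (gri_mass_frac / molar_weights_np).sum(axis=1)
df['psi'] = 1.0 / (R_m * T_vector)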
def get_extended_stimulus_presentations_df(self):
    return pd.read_hdf(self.extended_stimulus_presentations_df_path,
                       key='df')
def load_equities():
    return pd.read_hdf(custom_data_path / 'stooq.h5', 'jp/equities')
Example #54
0
def test_write_dl2_dataframe():
    from lstchain.tests.test_lstchain import dl2_file, test_dir
    from lstchain.io.io import dl2_params_lstcam_key
    dl2 = pd.read_hdf(dl2_file, key=dl2_params_lstcam_key)
    from lstchain.io import write_dl2_dataframe
    write_dl2_dataframe(dl2, os.path.join(test_dir, 'dl2_test.h5'))
import pandas as pd
import urllib.request

# Read from h5
df = pd.read_hdf('./02_io_tools/hdfstore.h5', 'd1')
print(df.head())

# Create JSON
df.to_json('./02_io_tools/example_json.json')

# Read JSON
df2 = pd.read_json('./02_io_tools/example_json.json')
print(df2.head())

# Request market history from the Bittrex public API (v1.1)
get_market_history_json = urllib.request.urlopen(
    'https://api.bittrex.com/api/v1.1/public/getmarkethistory?market=USD-BTC'
).read()
get_market_history_df = pd.read_json(get_market_history_json)

print(get_market_history_json)
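
If the endpoint still responds, the trade records sit under a 'result' key in the v1.1-style {'success', 'message', 'result'} envelope (an assumption here); a sketch of flattening them explicitly with json plus pd.json_normalize instead of feeding the raw payload to pd.read_json:

import json

# Assumed response envelope: {"success": ..., "message": ..., "result": [...]}
payload = json.loads(get_market_history_json)
trades_df = pd.json_normalize(payload.get("result", []))
print(trades_df.head())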
def AnalyzeVideosSession(video_dir):
    """
    DeepLabCut Toolbox
    https://github.com/AlexEMG/DeepLabCut

    A Mathis, [email protected]
    M Mathis, [email protected]

    This script analyzes videos based on a trained network (as specified in myconfig_analysis.py)

    You need tensorflow for evaluation. Run by:

    python3 AnalyzeVideosSession.py video_dir

    Functionalized by Adam S. Lowet, 10/25/19
    """

    ####################################################
    # Dependencies
    ####################################################

    import os.path
    import sys

    subfolder = os.getcwd().split('analysis-tools')[0]
    sys.path.append(subfolder)
    # add parent directory: (where nnet & config are!)
    sys.path.append(os.path.join(subfolder, "pose-tensorflow"))
    sys.path.append(os.path.join(subfolder, "config"))

    from myconfig_analysis import cropping, Task, date, \
        trainingsFraction, resnet, snapshotindex, shuffle, x1, x2, y1, y2, videotype, storedata_as_csv

    # Deep-cut dependencies
    from config import load_config
    from nnet import predict
    from dataset.pose_dataset import data_to_input

    # Dependencies for video:
    import pickle
    # import matplotlib.pyplot as plt
    import imageio
    from skimage.util import img_as_ubyte
    from moviepy.editor import VideoFileClip
    import skimage
    import skimage.color
    import time
    import pandas as pd
    import numpy as np
    import os
    from tqdm import tqdm

    def getpose(image, cfg, outputs, outall=False):
        ''' Adapted from DeeperCut, see pose-tensorflow folder'''
        image_batch = data_to_input(skimage.color.gray2rgb(image))
        outputs_np = sess.run(outputs, feed_dict={inputs: image_batch})
        scmap, locref = predict.extract_cnn_output(outputs_np, cfg)
        pose = predict.argmax_pose_predict(scmap, locref, cfg.stride)
        if outall:
            return scmap, locref, pose
        else:
            return pose

    ####################################################
    # Loading data, and defining model folder
    ####################################################

    basefolder = os.path.join('..', '..', 'pose-tensorflow', 'models')
    modelfolder = os.path.join(
        basefolder, Task + str(date) + '-trainset' +
        str(int(trainingsFraction * 100)) + 'shuffle' + str(shuffle))

    cfg = load_config(os.path.join(modelfolder, 'test', "pose_cfg.yaml"))

    ##################################################
    # Load and setup CNN part detector
    ##################################################

    # Check which snapshots are available and sort them by # iterations
    Snapshots = np.array([
        fn.split('.')[0]
        for fn in os.listdir(os.path.join(modelfolder, 'train'))
        if "index" in fn
    ])
    increasing_indices = np.argsort([int(m.split('-')[1]) for m in Snapshots])
    Snapshots = Snapshots[increasing_indices]

    print(modelfolder)
    print(Snapshots)

    ##################################################
    # Compute predictions over images
    ##################################################

    # Check if data already was generated:
    cfg['init_weights'] = os.path.join(modelfolder, 'train',
                                       Snapshots[snapshotindex])

    # Name for scorer:
    trainingsiterations = (cfg['init_weights'].split('/')[-1]).split('-')[-1]

    # Name for scorer:
    scorer = 'DeepCut' + "_resnet" + str(resnet) + "_" + Task + str(
        date) + 'shuffle' + str(shuffle) + '_' + str(trainingsiterations)

    cfg['init_weights'] = os.path.join(modelfolder, 'train',
                                       Snapshots[snapshotindex])
    sess, inputs, outputs = predict.setup_pose_prediction(cfg)
    pdindex = pd.MultiIndex.from_product(
        [[scorer], cfg['all_joints_names'], ['x', 'y', 'likelihood']],
        names=['scorer', 'bodyparts', 'coords'])

    ##################################################
    # Datafolder
    ##################################################

    # video_dir='../videos/' #where your folder with videos is.
    frame_buffer = 10

    os.chdir(video_dir)
    videos = np.sort([fn for fn in os.listdir(os.curdir) if (videotype in fn)])
    print("Starting ", video_dir, videos)
    for video in videos:
        dataname = video.split('.')[0] + scorer + '.h5'
        try:
            # Attempt to load data...
            pd.read_hdf(dataname)
            print("Video already analyzed!", dataname)
        except FileNotFoundError:
            print("Loading ", video)
            clip = VideoFileClip(video)
            nx, ny = clip.size  # frame dimensions; moviepy's clip.size is (width, height)
            fps = clip.fps
            #nframes = np.sum(1 for j in clip.iter_frames()) #this is slow (but accurate)
            nframes_approx = int(
                np.ceil(clip.duration * clip.fps) + frame_buffer)
            # this will overestimate the number of frames (see https://github.com/AlexEMG/DeepLabCut/issues/9). This is especially a problem
            # for high frame rates and long durations due to rounding errors (as Rich Warren found). Later we crop the result (line 187)

            if cropping:
                clip = clip.crop(y1=y1, y2=y2, x1=x1,
                                 x2=x2)  # one might want to adjust

            print("Duration of video [s]: ", clip.duration, ", recorded with ",
                  fps, "fps!")
            print("Overall # of frames: ", nframes_approx,
                  "with cropped frame dimensions: ", clip.size)

            start = time.time()
            PredicteData = np.zeros(
                (nframes_approx, 3 * len(cfg['all_joints_names'])))
            clip.reader.initialize()
            print("Starting to extract posture")
            for index in tqdm(range(nframes_approx)):
                #image = img_as_ubyte(clip.get_frame(index * 1. / fps))
                image = img_as_ubyte(clip.reader.read_frame())
                # Thanks to Rich Warren for the following snippet:
                # if close to end of video, start checking whether two adjacent frames are identical
                # this should only happen when moviepy has reached the final frame
                # if two adjacent frames are identical, terminate the loop
                if index == int(nframes_approx - frame_buffer * 2):
                    last_image = image
                elif index > int(nframes_approx - frame_buffer * 2):
                    if (image == last_image).all():
                        nframes = index
                        print("Detected frames: ", nframes)
                        break
                    else:
                        last_image = image
                pose = getpose(image, cfg, outputs)
                PredicteData[index, :] = pose.flatten()  # NOTE: cfg['all_joints_names'] must be in the same order as the bodyparts!

            stop = time.time()

            dictionary = {
                "start": start,
                "stop": stop,
                "run_duration": stop - start,
                "Scorer": scorer,
                "config file": cfg,
                "fps": fps,
                "frame_dimensions": (ny, nx),
                "nframes": nframes
            }
            metadata = {'data': dictionary}

            print("Saving results...")
            DataMachine = pd.DataFrame(
                PredicteData[:nframes, :],
                columns=pdindex,
                index=range(
                    nframes))  #slice pose data to have same # as # of frames.
            DataMachine.to_hdf(dataname,
                               'df_with_missing',
                               format='table',
                               mode='w')

            if storedata_as_csv:
                DataMachine.to_csv(video.split('.')[0] + scorer + '.csv')

            with open(
                    dataname.split('.')[0] + 'includingmetadata.pickle',
                    'wb') as f:
                pickle.dump(metadata, f, pickle.HIGHEST_PROTOCOL)
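
A sketch of reading one analyzed video's results back; the file name below is hypothetical, but the 'df_with_missing' key and the (scorer, bodyparts, coords) column MultiIndex follow from how the data are written above.

import pandas as pd

dataname = "trial01DeepCut_resnet50_TaskJan30shuffle1_500000.h5"  # hypothetical file name
df = pd.read_hdf(dataname, 'df_with_missing')
scorer = df.columns.get_level_values('scorer')[0]
bodypart = df.columns.get_level_values('bodyparts')[0]
print(df[scorer][bodypart][['x', 'y', 'likelihood']].head())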
def load_data():
    return pd.read_hdf("./raw_data/data.hdf", "master")
Example #58
0
import os.path as op

import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit, RepeatedKFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeCV

# NOTE: `cfg` (providing cfg.derivative_path below) is the project's configuration module.
input_path = "/storage/inria/agramfor/camcan_derivatives"

bands = [
    'alpha', 'beta_high', 'beta_low', 'delta', 'gamma_high', 'gamma_lo',
    'gamma_mid', 'low', 'theta'
]

# assemble matrices
data = list()
for band in bands:
    data.append(
        pd.read_hdf(op.join(input_path, f'mne_source_power_diag-{band}.h5'),
                    'mne_power_diag'))

data = pd.concat(data, axis=1)
subjects = data.index.values
# use subjects we used in previous nips submission
new_subjects = ['CC510256', 'CC520197', 'CC610051', 'CC121795', 'CC410182']

mask = ~np.in1d(subjects, new_subjects)
subjects = subjects[mask]

X = data.values[mask]

participants_fname = op.join(cfg.derivative_path, "participants.csv")
participants = pd.read_csv(participants_fname)
y = participants.set_index('Observations').age.loc[subjects].values
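
The snippet stops before any model is fit; a plausible continuation with the imports it already pulls in (a sketch, not necessarily the authors' exact pipeline; the alpha grid and CV settings are assumptions):

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Ridge regression of age on the band-power features, repeated 10-fold CV
model = make_pipeline(StandardScaler(), RidgeCV(alphas=np.logspace(-3, 5, 100)))
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')
print('MAE: %.2f +/- %.2f years' % (-scores.mean(), scores.std()))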
Example #59
0
def ExtractFramesbasedonPreselection(
    Index,
    extractionalgorithm,
    data,
    video,
    cfg,
    config,
    opencv=True,
    cluster_resizewidth=30,
    cluster_color=False,
    savelabeled=True,
    with_annotations=True,
):
    from deeplabcut.create_project import add

    start = cfg["start"]
    stop = cfg["stop"]
    numframes2extract = cfg["numframes2pick"]
    bodyparts = auxiliaryfunctions.IntersectionofBodyPartsandOnesGivenbyUser(
        cfg, "all")

    videofolder = str(Path(video).parents[0])
    vname = str(Path(video).stem)
    tmpfolder = os.path.join(cfg["project_path"], "labeled-data", vname)
    if os.path.isdir(tmpfolder):
        print("Frames from video", vname,
              " already extracted (more will be added)!")
    else:
        auxiliaryfunctions.attempttomakefolder(tmpfolder, recursive=True)

    nframes = len(data)
    print("Loading video...")
    if opencv:
        vid = VideoWriter(video)
        fps = vid.fps
        duration = vid.calc_duration()
    else:
        from moviepy.editor import VideoFileClip

        clip = VideoFileClip(video)
        fps = clip.fps
        duration = clip.duration

    if cfg["cropping"]:  # one might want to adjust
        coords = (cfg["x1"], cfg["x2"], cfg["y1"], cfg["y2"])
    else:
        coords = None

    print("Duration of video [s]: ", duration, ", recorded @ ", fps, "fps!")
    print("Overall # of frames: ", nframes,
          "with (cropped) frame dimensions: ")
    if extractionalgorithm == "uniform":
        if opencv:
            frames2pick = frameselectiontools.UniformFramescv2(
                vid, numframes2extract, start, stop, Index)
        else:
            frames2pick = frameselectiontools.UniformFrames(
                clip, numframes2extract, start, stop, Index)
    elif extractionalgorithm == "kmeans":
        if opencv:
            frames2pick = frameselectiontools.KmeansbasedFrameselectioncv2(
                vid,
                numframes2extract,
                start,
                stop,
                cfg["cropping"],
                coords,
                Index,
                resizewidth=cluster_resizewidth,
                color=cluster_color,
            )
        else:
            if cfg["cropping"]:
                clip = clip.crop(y1=cfg["y1"],
                                 y2=cfg["x2"],
                                 x1=cfg["x1"],
                                 x2=cfg["x2"])
            frames2pick = frameselectiontools.KmeansbasedFrameselection(
                clip,
                numframes2extract,
                start,
                stop,
                Index,
                resizewidth=cluster_resizewidth,
                color=cluster_color,
            )

    else:
        print(
            "Please implement this method yourself! Currently the options are 'kmeans', 'jump', 'uniform'."
        )
        frames2pick = []

    # Extract frames + frames with plotted labels and store them in a folder (named after the video) under labeled-data
    print("Let's select frames indices:", frames2pick)
    colors = visualization.get_cmap(len(bodyparts), cfg["colormap"])
    strwidth = int(np.ceil(np.log10(nframes)))  # width for strings
    for index in frames2pick:  ##tqdm(range(0,nframes,10)):
        if opencv:
            PlottingSingleFramecv2(
                vid,
                cfg["cropping"],
                coords,
                data,
                bodyparts,
                tmpfolder,
                index,
                cfg["dotsize"],
                cfg["pcutoff"],
                cfg["alphavalue"],
                colors,
                strwidth,
                savelabeled,
            )
        else:
            PlottingSingleFrame(
                clip,
                data,
                bodyparts,
                tmpfolder,
                index,
                cfg["dotsize"],
                cfg["pcutoff"],
                cfg["alphavalue"],
                colors,
                strwidth,
                savelabeled,
            )
        plt.close("all")

    # close videos
    if opencv:
        vid.close()
    else:
        clip.close()
        del clip

    # Extract annotations based on DeepLabCut and store in the folder (with name derived from video name) under labeled-data
    if len(frames2pick) > 0:
        try:
            if cfg["cropping"]:
                add.add_new_videos(
                    config, [video],
                    coords=[coords])  # make sure you pass coords as a list
            else:
                add.add_new_videos(config, [video], coords=None)
        except:  # can we make a catch here? - in fact we should drop indices from DataCombined if they are in CollectedData.. [ideal behavior; currently this is pretty unlikely]
            print(
                "AUTOMATIC ADDING OF VIDEO TO CONFIG FILE FAILED! You need to do this manually for including it in the config.yaml file!"
            )
            print("Videopath:", video, "Coordinates for cropping:", coords)
            pass

        if with_annotations:
            machinefile = os.path.join(
                tmpfolder,
                "machinelabels-iter" + str(cfg["iteration"]) + ".h5")
            if isinstance(data, pd.DataFrame):
                df = data.loc[frames2pick]
                df.index = [
                    os.path.join(
                        "labeled-data",
                        vname,
                        "img" + str(index).zfill(strwidth) + ".png",
                    ) for index in df.index
                ]  # exchange index number by file names.
            elif isinstance(data, dict):
                idx = [
                    os.path.join(
                        "labeled-data",
                        vname,
                        "img" + str(index).zfill(strwidth) + ".png",
                    ) for index in frames2pick
                ]
                filename = os.path.join(str(tmpfolder),
                                        f"CollectedData_{cfg['scorer']}.h5")
                try:
                    df_temp = pd.read_hdf(filename, "df_with_missing")
                    columns = df_temp.columns
                except FileNotFoundError:
                    columns = pd.MultiIndex.from_product(
                        [
                            [cfg["scorer"]],
                            cfg["individuals"],
                            cfg["multianimalbodyparts"],
                            ["x", "y"],
                        ],
                        names=["scorer", "individuals", "bodyparts", "coords"],
                    )
                    if cfg["uniquebodyparts"]:
                        columns2 = pd.MultiIndex.from_product(
                            [
                                [cfg["scorer"]],
                                ["single"],
                                cfg["uniquebodyparts"],
                                ["x", "y"],
                            ],
                            names=[
                                "scorer", "individuals", "bodyparts", "coords"
                            ],
                        )
                        df_temp = pd.concat((
                            pd.DataFrame(columns=columns),
                            pd.DataFrame(columns=columns2),
                        ))
                        columns = df_temp.columns
                array = np.full((len(frames2pick), len(columns)), np.nan)
                for i, index in enumerate(frames2pick):
                    data_temp = data.get(index)
                    if data_temp is not None:
                        vals = np.concatenate(data_temp)[:, :2].flatten()
                        array[i, :len(vals)] = vals
                df = pd.DataFrame(array, index=idx, columns=columns)
            else:
                return
            if Path(machinefile).is_file():
                Data = pd.read_hdf(machinefile, "df_with_missing")
                DataCombined = pd.concat([Data, df])
                # drop duplicate labels:
                DataCombined = DataCombined[~DataCombined.index.duplicated(
                    keep="first")]

                DataCombined.to_hdf(machinefile,
                                    key="df_with_missing",
                                    mode="w")
                DataCombined.to_csv(
                    os.path.join(tmpfolder, "machinelabels.csv")
                )  # this is always the most current one (as reading is from h5)
            else:
                df.to_hdf(machinefile, key="df_with_missing", mode="w")
                df.to_csv(os.path.join(tmpfolder, "machinelabels.csv"))

        print(
            "The outlier frames are extracted. They are stored in the subdirectory labeled-data\%s."
            % vname)
        print(
            "Once you extracted frames for all videos, use 'refine_labels' to manually correct the labels."
        )
    else:
        print("No frames were extracted.")
Example #60
0
def timegrid_one_batch(configs):
    batch_id = configs["batch_id"]

    with open(configs["pid_batch_file"], 'rb') as fp:
        obj = pickle.load(fp)
        batch_to_lst = obj["batch_to_lst"]
        batches = list(sorted(batch_to_lst.keys()))
        batch_idxs = batch_to_lst[batch_id]

    first_write = True
    create_static = configs["create_static"]
    create_dynamic = configs["create_dynamic"]
    print("Dispatched batch {} with {} patients".format(
        batch_id, len(batch_idxs)))

    for pidx, pid in enumerate(batch_idxs):

        if (pidx + 1) % 10 == 0:
            print("Progress in batch {}: {}/{}".format(batch_id, pidx + 1,
                                                       len(batch_idxs)))

        if create_static:
            static_extractor = eicu_static_tf.StaticExtractor()
            df_pat = pd.read_hdf(configs["input_patient_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_adm = pd.read_hdf(configs["input_admission_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_aav = pd.read_hdf(configs["input_apache_aps_var_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_apr = pd.read_hdf(configs["input_apache_patient_result_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_apv = pd.read_hdf(configs["input_apache_pred_var_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_static = static_extractor.transform(df_pat,
                                                   df_adm,
                                                   df_aav,
                                                   df_apr,
                                                   df_apv,
                                                   pid=pid)

        if create_dynamic:
            lab_vars = []

            with open(configs["selected_lab_vars"], 'r') as fp:
                csv_fp = csv.reader(fp, delimiter='\t')
                next(csv_fp)
                for lab_name in csv_fp:
                    lab_vars.append(lab_name[0].strip())

            grid_model = eicu_tf_impute.Timegridder()
            grid_model.set_selected_lab_vars(lab_vars)

            with open(configs["quantile_dict"], mode='r') as quantile_fp:
                var_quantile_dict = json.load(quantile_fp)
            grid_model.set_quantile_dict(var_quantile_dict)

            df_lab = pd.read_hdf(configs["input_lab_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_vs = pd.read_hdf(configs["input_vital_periodic_table"],
                                mode='r',
                                where="patientunitstayid={}".format(pid))
            df_avs = pd.read_hdf(configs["input_vital_aperiodic_table"],
                                 mode='r',
                                 where="patientunitstayid={}".format(pid))
            df_out = grid_model.transform(df_lab, df_vs, df_avs, pid=pid)

        if first_write:

            if create_dynamic:
                df_out.to_hdf(os.path.join(configs["output_dynamic_dir"],
                                           "batch_{}.h5".format(batch_id)),
                              configs["output_dset_id"],
                              append=False,
                              data_columns=["patientunitstayid"],
                              mode='w',
                              format="table",
                              complevel=configs["hdf_comp_level"],
                              complib=configs["hdf_comp_alg"])

            if create_static:
                df_static.to_hdf(os.path.join(configs["output_static_dir"],
                                              "batch_{}.h5".format(batch_id)),
                                 configs["output_dset_id"],
                                 append=False,
                                 data_columns=["patientunitstayid"],
                                 mode='w',
                                 format="table",
                                 complevel=configs["hdf_comp_level"],
                                 complib=configs["hdf_comp_alg"])

        else:

            if create_dynamic:
                df_out.to_hdf(os.path.join(configs["output_dynamic_dir"],
                                           "batch_{}.h5".format(batch_id)),
                              configs["output_dset_id"],
                              append=True,
                              data_columns=["patientunitstayid"],
                              mode='a',
                              format="table",
                              complevel=configs["hdf_comp_level"],
                              complib=configs["hdf_comp_alg"])

            if create_static:
                df_static.to_hdf(os.path.join(configs["output_static_dir"],
                                              "batch_{}.h5".format(batch_id)),
                                 configs["output_dset_id"],
                                 append=True,
                                 data_columns=["patientunitstayid"],
                                 mode='a',
                                 format="table",
                                 complevel=configs["hdf_comp_level"],
                                 complib=configs["hdf_comp_alg"])

        first_write = False
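
Because 'patientunitstayid' is declared as a data column when the batches are written, single stays can later be queried from a batch file without loading it fully; a sketch with assumed file, dataset and patient identifiers:

import pandas as pd

batch_path = "output_dynamic/batch_0.h5"   # hypothetical batch file under output_dynamic_dir
dset_id = "timegrid"                       # hypothetical output_dset_id
one_stay = pd.read_hdf(batch_path, dset_id, where="patientunitstayid=141234")  # made-up id
print(one_stay.shape)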