def test_pivot_no_column(dataframe1):
    with pytest.raises(ValueError):
        odo(str(dataframe1), DataFrame, values="bar", annotate=True, index="uri")
def test_custom_annotate_df(dataframe1, dataframe2):
    def _annotate_fn(df, uri, **kwargs):
        uristr = os.path.basename(uri)
        df['sample'] = uristr.split(".")[0]
        return df

    df1 = odo(str(dataframe1), DataFrame, annotate=True, annotation_fn=_annotate_fn)
    df2 = df1.append(odo(str(dataframe2), DataFrame, annotate=True, annotation_fn=_annotate_fn))
    assert set(df2['sample']) == {'sample1', 'sample2'}
def test_annotate_df_regex(dataframe1, dataframe2):
    regex = r".*(?P<sample>sample\d+)\.(?P<df>[a-z]+\d+)\.csv"
    df1 = odo(str(dataframe1), DataFrame, annotate=True, regex=regex)
    df2 = df1.append(odo(str(dataframe2), DataFrame, annotate=True, regex=regex))
    # note: the closing parenthesis must sit before the comparison; putting it
    # after turned these assertions into vacuous truthiness checks on a set
    assert set(df2['sample']) == {'sample1', 'sample2'}
    assert set(df2['df']) == {'dataframe1', 'dataframe2'}
def test_pivot_no_value(dataframe1):
    df = odo(str(dataframe1), DataFrame, columns="bar", annotate=True, index="uri")
    assert df.index.name == "uri"
def __init__(self):
    super(StockModel, self).__init__()
    file_name = "notebooks/db2.bcolz"
    self.df = bz.odo(file_name, pd.DataFrame)[['Date', 'Close']]  # [1000:1100]
    self.devol()
    self.returns_df = None
def bokeh_constructor(loader, node):
    """Build a bokeh plot."""
    global workspace
    args = loader.construct_mapping(node, deep=True)
    args = resolve_pointer(workspace, args)
    source = None
    if 'figure' not in args:
        args['figure'] = {}
    args['figure'] = resolve_pointer(workspace, args['figure'])
    if 'source' in args:
        source = blaze.odo(args['source'], ColumnDataSource)
    p = figure(**args['figure'])
    for glyph, kwargs in yaml_to_args(args['glyphs']):
        if source:
            kwargs['source'] = source
        getattr(p, glyph)(**kwargs)
    return p
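# A minimal sketch of how this constructor might be wired into PyYAML; the
# "!bokeh" tag name and the YAML snippet are assumptions, not taken from
# this file.
yaml.add_constructor("!bokeh", bokeh_constructor)

# A YAML document could then declare a plot declaratively, e.g.:
#
# plot: !bokeh
#   figure: {title: demo}
#   source: static/iris.data.txt
#   glyphs:
#     circle: {x: sepal_length, y: sepal_width}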
def test_pivot_regex(dataframe1):
    df = odo(str(dataframe1), DataFrame, columns="foo", values="bar", index="sample",
             regex=r".*(?P<sample>sample\d+)\.(?P<df>[a-z]+\d+)\.csv")
    assert list(df.index) == ["sample1"]
def show_table():
    global datademo, source
    if request.args.get('data'):
        datademo = pd.read_csv(request.args.get('data'))
    else:
        datademo = bz.odo(data['iris'], pd.DataFrame)
    source = ColumnDataSource(datademo)
    return render_template('layout.html')
def test_pivot_uri(dataframe1):
    df = odo(str(dataframe1), DataFrame, columns="foo", values="bar", index="uri", annotate=True)
    assert df.shape == (1, 2)
    assert list(df.columns) == [1, 3]
    assert df.index.name == "uri"
def tail(table, n=10, to_frame=False):
    """Extract the end of a table.

    Args:
        table (str): Name of the table to slice
        n (int): Number of entries to extract
        to_frame (bool): Return a pandas dataframe (default False)

    Returns:
        sliced: Blaze data object or pandas dataframe
    """
    sliced = db[table].sort().tail(n)
    if to_frame:
        return bz.odo(sliced, pd.DataFrame)
    return sliced
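# Hypothetical usage of tail; the "accounts" table name is an assumption:
last_rows = tail("accounts", n=5, to_frame=True)  # pandas DataFrame of 5 rows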
def show_dataframe(data_key):
    """Resolve a JSON pointer to access bz.Data."""
    convert = jp.resolve_pointer(data, '/' + data_key.rstrip('/'))
    if data_key.split('/')[0] in ['iris', 'accounts']:
        # Convert blaze object to an HTML representation
        return bz.odo(convert, pd.DataFrame).to_html()
    else:
        # Return a JSON string
        return json.dumps(convert)
def annotate_df(infile, parser, groupnames=["SM"]):
    """Annotate a parsed odo unit. Assumes metadata information is
    stored in the input file name.

    Args:
        infile (str): file name
        parser (re): compiled regexp object to parse the input file name
            with. Metadata information to parse is stored in the file name.
        groupnames (list): list of parser group names to use. For each
            name <name>, the parser should have a corresponding
            (?P<name>...) expression
    """
    df = odo(infile, pd.DataFrame)
    # a compiled re pattern has no parse() method; search() returns the match
    m = parser.search(infile)
    for name in groupnames:
        df[name] = str(m.group(name))
    return df
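# Hypothetical usage of annotate_df; the file name and pattern below are
# illustrative assumptions in the spirit of the regexes used elsewhere here.
import re
parser = re.compile(r".*(?P<SM>sample\d+)\..*\.csv")
df = annotate_df("sample1.dataframe1.csv", parser, groupnames=["SM"])
# every row of df now carries SM == "sample1"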
def aggregate(self, key=None, **kwargs):
    """Aggregate targets to aggregate_targets, if present.

    Args:
        key (str): aggregate targets for key; if None, all targets are aggregated
        kwargs (dict): additional arguments, passed to *all* hooks
    """
    if not self.run:
        return self
    for k, v in self.iotargets.items():
        smllogger.debug("Aggregating key {}, iotargets: {}".format(k, v))
        if key is not None and key != k:
            continue
        if self.iotargets[k][1] is None:
            smllogger.debug("Skipping iotarget key {}".format(k))
            continue
        annotate = self._annotate
        if self._annotation_funcs.get(k, None) is not None:
            smllogger.debug("Annotating data")
            annotate = True
        try:
            dflist = [odo(x, pd.DataFrame, annotate=annotate,
                          annotation_fn=self._annotation_funcs.get(k, None),
                          key=k, **kwargs)
                      for x in self.targets[k]]
        except Exception:
            smllogger.warn("Unable to generate data frame list; not aggregating results for targets {}".format(self.targets[k]))
            return self
        # Run post-processing hooks, if any
        if self._post_processing_hooks.get(k, None) is not None:
            smllogger.debug("Running post processing hook")
            dflist = [self._post_processing_hooks[k](df, **kwargs) for df in dflist]
        df = pd.concat(dflist)
        # Run post-processing hook for aggregated data, if any
        if self._aggregate_post_processing_hooks.get(k, None) is not None:
            smllogger.debug("Running post processing hook on aggregated data")
            df = self._aggregate_post_processing_hooks[k](df, **kwargs)
        self._aggregate_data[k] = df
    return self
def _blaze(self, _selects, _wheres, _groups, _aggs, _offset, _limit, _sorts, _count, _q):
    import blaze as bz
    import datashape
    # TODO: Not caching blaze connections
    parameters = self.params.get('parameters', {})
    bzcon = bz.Data(
        self.params['url'] + ('::' + self.params['table'] if self.params.get('table') else ''),
        **parameters)
    table = bz.Symbol('table', bzcon.dshape)
    columns = table.fields
    query = table

    if _wheres:
        wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
        wheres = None
        for where in _wheres:
            match = wh_re.search(where)
            if match is None:
                continue
            col, oper, val = match.groups()
            col = table[col]
            if oper in ['==', '=']:
                whr = (col == val)
            elif oper == '>=':
                whr = (col >= val)
            elif oper == '<=':
                whr = (col <= val)
            elif oper == '>':
                whr = (col > val)
            elif oper == '<':
                whr = (col < val)
            elif oper == '!=':
                whr = (col != val)
            elif oper == '~':
                whr = col.like('*' + val + '*')
            elif oper == '!~':
                whr = ~col.like('*' + val + '*')
            wheres = whr if wheres is None else wheres & whr
        query = query if wheres is None else query[wheres]

    alias_cols = []
    if _groups and _aggs:
        byaggs = {'min': bz.min, 'max': bz.max, 'sum': bz.sum,
                  'count': bz.count, 'mean': bz.mean, 'nunique': bz.nunique}
        # character class fixed: [aA-zZ] also matched punctuation
        agg_re = re.compile(r'([^:]+):([a-zA-Z]+)\(([^:]+)\)')
        grps = bz.merge(*[query[group] for group in _groups])
        aggs = {}
        for agg in _aggs:
            match = agg_re.search(agg)
            if match is None:
                continue
            name, oper, col = match.groups()
            alias_cols.append(name)
            aggs[name] = byaggs[oper](query[col])
        query = bz.by(grps, **aggs)

    if _q:
        wheres = None
        for col in columns:
            if isinstance(table[col].dshape.measure.ty, datashape.coretypes.String):
                whr = table[col].like('*' + _q + '*')
                wheres = whr if wheres is None else wheres | whr
        if wheres is not None:
            query = query[wheres]

    count_query = query.count()

    if _sorts:
        order = {'asc': True, 'desc': False}
        sorts = []
        for sort in _sorts:
            col, odr = sort.partition(':')[::2]
            if col not in columns + alias_cols:
                continue
            sorts.append(col)
        if sorts:
            # note: only the direction of the last sort seen applies to all columns
            query = query.sort(sorts, ascending=order.get(odr, True))

    if _offset:
        _offset = int(_offset)
    if _limit:
        _limit = int(_limit)
    if _offset and _limit:
        _limit += _offset
    if _offset or _limit:
        query = query[_offset:_limit]
    if _selects:
        query = query[_selects]

    # TODO: Improve json, csv, html outputs using native odo
    result = {
        'query': query,
        'data': bz.odo(bz.compute(query, bzcon.data), pd.DataFrame),
    }
    if _count:
        count = bz.odo(bz.compute(count_query, bzcon.data), pd.DataFrame)
        result['count'] = count.iloc[0, 0]
    return result
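# Illustrative (hypothetical) call showing the string mini-languages parsed
# above -- "col<op>val" filters, "name:agg(col)" aggregates, "col:dir" sorts;
# the column names are assumptions:
#
#   self._blaze(_selects=None,
#               _wheres=['amount>=100', 'city~york'],
#               _groups=['name'],
#               _aggs=['total:sum(amount)'],
#               _offset=0, _limit=10,
#               _sorts=['total:desc'],
#               _count=True, _q=None)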
xg_val = xgb.DMatrix(val_X, val_Y)

# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# step size shrinkage (learning rate)
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 5

num_round = 5
bst = xgb.train(param, xg_train, num_round)
# get prediction
val_pred = bst.predict(xg_val)
# get metrics
score = accuracy_score(val_Y, val_pred)

# predict test set and save as csv
run_test = True
if run_test:
    xg_test = xgb.DMatrix(test_features)
    test_labels = bst.predict(xg_test)
    test_labels_df = bz.Data(zip(test_imgs, np.int64(test_labels)),
                             fields=['image', 'level'])
    bz.odo(test_labels_df, test_result)
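# With 'multi:softmax', predict() returns hard class labels, which is why
# accuracy_score consumes val_pred unchanged. A sketch of the probability
# variant, assuming the same data setup as above:
param['objective'] = 'multi:softprob'
bst_prob = xgb.train(param, xg_train, num_round)
val_prob = bst_prob.predict(xg_val)   # shape (n_samples, num_class)
val_pred2 = val_prob.argmax(axis=1)   # recover hard labels from probabilities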
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 09 13:44:53 2017

@author: Sriva
"""
import blaze as bz

# csv_path = 'C:/Users/Sriva/Desktop/George Mason/Spring 2017/DAEN 690/database/GNDTNDR.csv'
# bz.odo(csv_path, 'sqlite:///data.db::data')

csv_path = 'C:/Users/Sriva/Desktop/George Mason/Spring 2017/DAEN 690/database/GNDITEM.csv'
bz.odo(csv_path, 'sqlite:///data.db::gnditem')
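# The loaded table can be read back with the same URI convention (sketch):
import pandas as pd
df = bz.odo('sqlite:///data.db::gnditem', pd.DataFrame)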
def test_qualimap_2_1_3(qualimap_data_2_1_3):
    df = odo(str(qualimap_data_2_1_3), DataFrame, key='Coverage_per_contig')
    assert list(df.columns) == ['chrlen', 'mapped_bases', 'mean_coverage', 'sd']
    assert list(df.index) == ['chr11']
Data contains the data""", action="store_true", required=False)
parser.add_argument('source', metavar="SOURCE", type=str,
                    help='The source data to show')

if __name__ == "__main__":
    args = parser.parse_args()
    kwargs = {}
    for arg in args.args:
        if "=" in arg:
            key, value = arg.split("=")
            if value.startswith("int:"):
                value = int(value[len("int:"):])
            kwargs[key] = value
        else:
            kwargs[arg] = True
    data = blaze.odo(args.source, pandas.DataFrame, **kwargs)
    if args.ipython:
        import IPython
        dict_ = globals()
        dict_.update(locals())
        IPython.start_ipython(argv=[], user_ns=dict_)
    data.plot()
    plt.show()
def test_annotate_df(dataframe1, dataframe2):
    df1 = odo(str(dataframe1), DataFrame, annotate=True)
    df2 = df1.append(odo(str(dataframe2), DataFrame, annotate=True))
    assert {os.path.basename(x) for x in df2['uri']} == \
        {'sample1.dataframe1.csv', 'sample2.dataframe2.csv'}
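# The dataframe1/dataframe2 fixtures themselves are not shown here. A minimal
# sketch of what they might look like, inferred from the file names asserted
# above and the foo/bar columns used in the pivot tests; the cell values are
# assumptions:
import pytest

@pytest.fixture
def dataframe1(tmpdir_factory):
    fn = tmpdir_factory.mktemp("data").join("sample1.dataframe1.csv")
    fn.write("foo,bar\n1,2\n3,4\n")
    return fn

@pytest.fixture
def dataframe2(tmpdir_factory):
    fn = tmpdir_factory.mktemp("data").join("sample2.dataframe2.csv")
    fn.write("foo,bar\n5,6\n7,8\n")
    return fn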
def test_metrics(align_metrics):
    metrics = odo(str(align_metrics), DataFrame)
    assert metrics.loc["FIRST_OF_PAIR"]["MEAN_READ_LENGTH"] == 76
from bokeh.models import ColumnDataSource
from bokeh.resources import CDN
from bokeh.protocol import serialize_json

# Import interactive blaze data
data = {
    'iris': bz.Data(bz.CSV('static/iris.data.txt')),
    'accounts': bz.Data('static/accounts.json'),
}

# Import some JSON data for JSON pointer example
with open('static/gh.json', 'r') as f:
    data['github'] = json.load(f)

# Dataframe for Bokeh demo
datademo = bz.odo(data['iris'], pd.DataFrame)
source = ColumnDataSource(datademo)

with open('app.yaml') as f:
    config = yaml.load(f, Loader=yaml.BaseLoader)


def keys():
    """ Return dataframe keys """
    # return json.dumps( datademo.columns.values.tolist() )


def dataframe_html():
    """
def test_rseqc_glob(rseqc_read_distribution, rseqc_read_distribution2):
    df = odo(
        os.path.join(os.path.dirname(os.path.dirname(str(rseqc_read_distribution))),
                     "*/*distribution.txt"),
        DataFrame
    )
    assert df.shape == (20, 3)
def test_hist_metrics(insert_metrics):
    metrics = odo(str(insert_metrics), DataFrame)
    hist = odo(str(insert_metrics), DataFrame, key="hist")
    assert all(metrics["MEDIAN_INSERT_SIZE"] == [156])
    assert all(hist["insert_size"] == [70, 76, 78])
def test_star_log(star_data):
    df = odo(str(star_data), DataFrame)
    assert df.loc["% of reads unmapped: too short", "value"] == 8.73
    assert df.loc["Uniquely mapped reads number", "value"] == 4011114
def test_cutadapt_se(cutadapt_se_data):
    df = odo(str(cutadapt_se_data), DataFrame)
    assert df.loc["Reads with adapters"]["value"] == 54
def test_xls(xlsdata):
    df = odo(str(xlsdata), DataFrame)
    assert all(df["Position"] == range(0, 7))
from pipeline import *
from operations import *
from features import FEATURE_CLASSES

translations_and_scores = pump(scores='data/hy.basic.scores.csv')
q_features = multiply(translations_and_scores, 'q', *FEATURE_CLASSES)
r_features = multiply(translations_and_scores, 'r', *FEATURE_CLASSES)
feature_deltas = delta(q_features, 'q', r_features, 'r', *FEATURE_CLASSES)
# delta is not commutative, so consider also the reverse:
# delta(r_features, q_features, *FEATURE_CLASSES)

# print translations_and_scores
# print q_features
# print r_features
# print feature_deltas

from blaze import merge, odo
everything = merge(translations_and_scores, feature_deltas)
odo(everything, 'data/features.csv')
z = np.arange(10)
z_hdf5 = xuan.create_dataset("z", data=z)
# if you want to get the content under the 'folder':
[i for i in f.items()]
# if you want to drop a folder/dataset:
del f['xuan']

# About odo
import blaze as bz
import pandas as pd
x = bz.data("test.csv")
x = bz.odo(x, pd.DataFrame)

# About dask: a powerful data-processing module
# 1 array methods
## create and store
# generate test data
z = np.arange(2e4)
z = z.reshape((int(1e4), 2))
# this would raise a memory error:
# z = z.dot(z.transpose())
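# A minimal dask sketch of handling the product above lazily instead of
# materializing it; the chunk size is an arbitrary assumption:
import dask.array as da
dz = da.from_array(z, chunks=(1000, 2))
big = dz.dot(dz.transpose())   # lazy 1e4 x 1e4 task graph; nothing computed yet
print(big.sum().compute())     # reductions stream chunk by chunk without materializing big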
def test_fastqc_fq(fastqc_fq_summary_data):
    df = odo(str(fastqc_fq_summary_data), DataFrame)
    # placeholder assertion; the test only checks that parsing does not raise
    assert 1 == 1
    print(df)
# generate train and test (i.e. validation) files for caffe finetuning
import os
import sys
import numpy as np
import blaze as bz
import glob
import random

data_dir = os.environ['DATA_DIR']

# train and test images and label csvs
train_csv = bz.Data(data_dir + 'diabetic_ret/trainLabels.csv')
train = []
for img_name, label in train_csv:
    train.append(data_dir + 'diabetic_ret/train_resized/' + img_name + '.jpeg ' + str(label))
random.shuffle(train)

# hold out the first fifth as the validation split
# (integer division and newline separator fixed)
bz.odo(train[len(train) // 5:], 'train.txt', sep='\n')
bz.odo(train[:len(train) // 5], 'val.txt', sep='\n')
def test_cutadapt_pe(cutadapt_pe_data):
    df = odo(str(cutadapt_pe_data), DataFrame)
    assert df.loc["Read 1 with adapter"]["value"] == 54
    assert list(df.loc["Read 1"]["value"]) == [76076, 4930, 66777]
def from_blaze(cls, filename, date_col='Date', value_col='Close'):
    df = bz.odo(filename, pd.DataFrame)[[date_col, value_col]]  # [1000:1100]
    df = df.rename(columns={value_col: 'Value'})
    ts = df.set_index(date_col)
    return cls(ts)
def test_rseqc_read_distribution(rseqc_read_distribution):
    df = odo(str(rseqc_read_distribution), DataFrame)
    assert "TES_down_10kb" in df.index
    assert df.loc["Introns", "Tag_count"] == 2022848