Example #1
def test_pivot_no_column(dataframe1):
    with pytest.raises(ValueError):
        odo(str(dataframe1),
            DataFrame,
            values="bar",
            annotate=True,
            index="uri")
Example #2
def test_custom_annotate_df(dataframe1, dataframe2):
    def _annotate_fn(df, uri, **kwargs):
        uristr = os.path.basename(uri)
        df['sample'] = uristr.split(".")[0]
        return df
    
    df1 = odo(str(dataframe1), DataFrame, annotate=True, annotation_fn=_annotate_fn)
    df2 = df1.append(odo(str(dataframe2), DataFrame, annotate=True, annotation_fn=_annotate_fn))
    assert set(df2['sample']) == {'sample1', 'sample2'}
Example #3
def test_annotate_df_regex(dataframe1, dataframe2):
    df1 = odo(str(dataframe1),
              DataFrame,
              annotate=True,
              regex=".*(?P<sample>sample\d+)\.(?P<df>[a-z]+\d+)\.csv")
    df2 = df1.append(
        odo(str(dataframe2),
            DataFrame,
            annotate=True,
            regex=".*(?P<sample>sample\d+)\.(?P<df>[a-z]+\d+)\.csv"))
    assert set(df2['sample']) == {'sample1', 'sample2'}
    assert set(df2['df']) == {'dataframe1', 'dataframe2'}
Example #4
def test_custom_annotate_df(dataframe1, dataframe2):
    def _annotate_fn(df, uri, **kwargs):
        uristr = os.path.basename(uri)
        df['sample'] = uristr.split(".")[0]
        return df

    df1 = odo(str(dataframe1),
              DataFrame,
              annotate=True,
              annotation_fn=_annotate_fn)
    df2 = df1.append(
        odo(str(dataframe2),
            DataFrame,
            annotate=True,
            annotation_fn=_annotate_fn))
    assert set(df2['sample']) == {'sample1', 'sample2'}
Example #5
def test_pivot_no_value(dataframe1):
    df = odo(str(dataframe1),
             DataFrame,
             columns="bar",
             annotate=True,
             index="uri")
    assert df.index.name == "uri"
Example #6
    def __init__(self):
        super(StockModel, self).__init__()
        file_name = "notebooks/db2.bcolz"
        self.df = bz.odo(file_name, pd.DataFrame)[['Date',
                                                   'Close']]  # [1000:1100]
        self.devol()
        self.returns_df = None
Example #7
def bokeh_constructor( loader, node ):
    """
    build a bokeh plot
    """
    global workspace
    args = loader.construct_mapping(node, deep=True)
    args = resolve_pointer( workspace, args )

    source = None

    if 'figure' not in args:
        args['figure'] = {}

    args['figure'] = resolve_pointer( workspace, args['figure'] )
    if 'source' in args:
        source = blaze.odo( args['source'], ColumnDataSource )

    p = figure( **args['figure'] )

    for glyph, kwargs in yaml_to_args(args['glyphs']):
        if source:
            kwargs['source'] = source
        getattr( p, glyph )( **kwargs )

    return p
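A sketch of wiring this constructor into PyYAML; the "!bokeh" tag name is an
assumption here, the real tag is defined by the application that ships
bokeh_constructor:

import yaml

# nodes tagged !bokeh in the YAML config are now built into bokeh figures
yaml.add_constructor('!bokeh', bokeh_constructor, Loader=yaml.Loader)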
Example #8
def test_pivot_regex(dataframe1):
    df = odo(str(dataframe1),
             DataFrame,
             columns="foo",
             values="bar",
             index="sample",
             regex=".*(?P<sample>sample\d+)\.(?P<df>[a-z]+\d+)\.csv")
    assert list(df.index) == ["sample1"]
Example #9
def show_table():
    global datademo, source
    if request.args.get('data'):
        datademo = pd.read_csv( request.args.get('data') )
    else:
        datademo = bz.odo( data['iris'], pd.DataFrame )

    source = ColumnDataSource( datademo )
    return render_template( 'layout.html')
Example #10
def test_pivot_uri(dataframe1):
    df = odo(str(dataframe1),
             DataFrame,
             columns="foo",
             values="bar",
             index="uri",
             annotate=True)
    assert df.shape == (1, 2)
    assert list(df.columns) == [1, 3]
    assert df.index.name == "uri"
Example #11
def tail(table, n=10, to_frame=False):
    """
    Extract the end of a table.

    Args:
        table (str): Name of the table to slice
        n (int): Number of entries to extract
        to_frame (bool): Return a dataframe (default False)

    Returns:
        sliced: Blaze data object or pandas dataframe
    """
    sliced = db[table].sort().tail(n)
    if to_frame:
        return bz.odo(sliced, pd.DataFrame)
    return sliced
Example #12
def show_dataframe( data_key ):
    """
    JSON pointer to access bz.Data
    """
    convert = jp.resolve_pointer(
        data, '/' + data_key.rstrip('/')
    )

    if data_key.split('/')[0] in ['iris','accounts']:
        # Convert blaze object to html representation
        return bz.odo(
            convert , pd.DataFrame
        ).to_html()
    else:
        # Return JSON string
        return json.dumps(convert)
Example #13
def annotate_df(infile, parser, groupnames=["SM"]):
    """Annotate a parsed odo unit.
    
    Assumes metadata information is stored in input file name.

    Args:
      infile (str): file name
      parser (re): regexp object to parse input file name with. Metadata information to parse is stored in file name
     
      groupnames (list): list of parser group names to use. For each
      name <name>, the parser should have a corresponding (?P<name>...)
      expression
    """
    df = odo(infile, pd.DataFrame)
    m = parser.parse(infile)
    for name in groupnames:
        df[name] = str(m[name])
    return df
Example #14
    def aggregate(self, key=None, **kwargs):
        """Aggregate targets to aggregate_targets, if present.

        Args:
          key (str): aggregate targets for key, if None all targets are aggregated
          kwargs (dict): additional arguments, passed to *all* hooks

        """

        if not self.run:
            return self
        for k, v in self.iotargets.items():
            smllogger.debug("Aggregating key {}, iotargets: {}".format(k, v))
            if key is not None and key != k:
                continue
            if self.iotargets[k][1] is None:
                smllogger.debug("Skipping iotarget key ", k)
                continue
            annotate = self._annotate
            if self._annotation_funcs.get(k, None) is not None:
                smllogger.debug("Annotating data")
                annotate = True
            try:
                dflist = [
                    odo(x, pd.DataFrame,
                        annotate=annotate,
                        annotation_fn=self._annotation_funcs.get(k, None),
                        key=k, **kwargs)
                    for x in self.targets[k]
                ]
            except Exception:
                smllogger.warning(
                    "Unable to generate data frame list; not aggregating "
                    "results for targets {}".format(self.targets[k]))
                return self
            # Run post-processing hooks, if any
            if self._post_processing_hooks.get(k, None) is not None:
                smllogger.debug("Running post processing hook")
                dflist = [self._post_processing_hooks[k](df, **kwargs) for df in dflist]
            df = pd.concat(dflist)
            # Run post-processing hook for aggregated data, if any
            if self._aggregate_post_processing_hooks.get(k, None) is not None:
                smllogger.debug("Running post processing hook on aggregated data")
                df = self._aggregate_post_processing_hooks[k](df, **kwargs)
            self._aggregate_data[k] = df
        return self
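For reference, a minimal sketch of a post-processing hook as consumed by
aggregate() above; the hook body and the "metrics" key are hypothetical:

def remove_totals(df, **kwargs):
    # drop a summary row from each per-target frame before concatenation
    return df[df.index != "Total"]

# hooks are looked up per iotarget key, e.g.
# self._post_processing_hooks["metrics"] = remove_totals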
Example #15
def annotate_df(infile, parser, groupnames=["SM"]):
    """Annotate a parsed odo unit.

    Assumes metadata information is stored in input file name.

    Args:
      infile (str): file name
      parser (re): regexp object to parse input file name with.
                   Metadata information to parse is stored in file name

      groupnames (list): list of parser group names to use. For each
                         name <name>, the parser should have a
                         corresponding (?P<name>...) expression

    """
    df = odo(infile, pd.DataFrame)
    m = parser.parse(infile)
    for name in groupnames:
        df[name] = str(m[name])
    return df
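A usage sketch, assuming a parser object exposing a parse() method that
returns a mapping of group names, which is what annotate_df expects;
RegexParser is a hypothetical wrapper around re:

import re

class RegexParser:
    def __init__(self, pattern):
        self._re = re.compile(pattern)

    def parse(self, s):
        m = self._re.search(s)
        return m.groupdict() if m else {}

parser = RegexParser(r".*(?P<SM>sample\d+)\..*\.csv")
df = annotate_df("sample1.dataframe1.csv", parser, groupnames=["SM"])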
Example #16
    def _blaze(self, _selects, _wheres, _groups, _aggs, _offset, _limit,
               _sorts, _count, _q):
        import blaze as bz
        import datashape
        # TODO: Not caching blaze connections
        parameters = self.params.get('parameters', {})
        bzcon = bz.Data(
            self.params['url'] +
            ('::' + self.params['table'] if self.params.get('table') else ''),
            **parameters)
        table = bz.Symbol('table', bzcon.dshape)
        columns = table.fields
        query = table

        if _wheres:
            wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
            wheres = None
            for where in _wheres:
                match = wh_re.search(where)
                if match is None:
                    continue
                col, oper, val = match.groups()
                col = table[col]
                if oper in ['==', '=']:
                    whr = (col == val)
                elif oper == '>=':
                    whr = (col >= val)
                elif oper == '<=':
                    whr = (col <= val)
                elif oper == '>':
                    whr = (col > val)
                elif oper == '<':
                    whr = (col < val)
                elif oper == '!=':
                    whr = (col != val)
                elif oper == '~':
                    whr = (col.like('*' + val + '*'))
                elif oper == '!~':
                    whr = (~col.like('*' + val + '*'))
                wheres = whr if wheres is None else wheres & whr
            query = query if wheres is None else query[wheres]

        alias_cols = []
        if _groups and _aggs:
            byaggs = {
                'min': bz.min,
                'max': bz.max,
                'sum': bz.sum,
                'count': bz.count,
                'mean': bz.mean,
                'nunique': bz.nunique
            }
            agg_re = re.compile(r'([^:]+):([A-Za-z]+)\(([^:]+)\)')
            grps = bz.merge(*[query[group] for group in _groups])
            aggs = {}
            for agg in _aggs:
                match = agg_re.search(agg)
                if match is None:
                    continue
                name, oper, col = match.groups()
                alias_cols.append(name)
                aggs[name] = byaggs[oper](query[col])
            query = bz.by(grps, **aggs)

        if _q:
            wheres = None
            for col in columns:
                if isinstance(table[col].dshape.measure.ty,
                              datashape.coretypes.String):
                    whr = table[col].like('*' + _q + '*')
                    wheres = whr if wheres is None else wheres | whr
            if wheres is not None:
                query = query[wheres]

        count_query = query.count()

        if _sorts:
            order = {'asc': True, 'desc': False}
            sorts, ascending = [], True
            for sort in _sorts:
                col, odr = sort.partition(':')[::2]
                if col not in columns + alias_cols:
                    continue
                sorts.append(col)
                # blaze sorts all keys in one direction; the last valid
                # order specifier wins
                ascending = order.get(odr, True)
            if sorts:
                query = query.sort(sorts, ascending=ascending)

        if _offset:
            _offset = int(_offset)
        if _limit:
            _limit = int(_limit)
        if _offset and _limit:
            _limit += _offset
        if _offset or _limit:
            query = query[_offset:_limit]

        if _selects:
            query = query[_selects]

        # TODO: Improve json, csv, html outputs using native odo
        result = {
            'query': query,
            'data': bz.odo(bz.compute(query, bzcon.data), pd.DataFrame),
        }
        if _count:
            count = bz.odo(bz.compute(count_query, bzcon.data), pd.DataFrame)
            result['count'] = count.iloc[0, 0]
        return result
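To illustrate (not from the source), the where-clause regex above splits a
filter string into column, operator, and value like so:

import re

wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
print(wh_re.search('age>=30').groups())     # ('age', '>=', '30')
print(wh_re.search('name~smith').groups())  # ('name', '~', 'smith')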
Example #17
    xg_val = xgb.DMatrix(val_X, val_Y)

    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softmax'
    # scale weight of positive examples
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['nthread'] = 4
    param['num_class'] = 5

    num_round = 5
    bst = xgb.train(param, xg_train, num_round)

    # get prediction
    val_pred = bst.predict(xg_val)

    # get metrics
    score = accuracy_score(val_Y, val_pred)

    # predict test_set and save as csv
    run_test = True
    if run_test:
        xg_test = xgb.DMatrix(test_features)
        test_labels = bst.predict(xg_test)
        test_labels_df = bz.Data(list(zip(test_imgs, np.int64(test_labels))),
                                 fields=['image', 'level'])
        bz.odo(test_labels_df, test_result)
Example #18
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 09 13:44:53 2017

@author: Sriva
"""

import blaze as bz
#csv_path = 'C:/Users/Sriva/Desktop/George Mason/Spring 2017/DAEN 690/database/GNDTNDR.csv'
#bz.odo(csv_path, 'sqlite:///data.db::data')
csv_path = 'C:/Users/Sriva/Desktop/George Mason/Spring 2017/DAEN 690/database/GNDITEM.csv'
bz.odo(csv_path, 'sqlite:///data.db::gnditem')
Example #19
def test_qualimap_2_1_3(qualimap_data_2_1_3):
    df = odo(str(qualimap_data_2_1_3), DataFrame, key='Coverage_per_contig')
    assert list(df.columns) == ['chrlen', 'mapped_bases', 'mean_coverage', 'sd']
    assert list(df.index) == ['chr11']
Example #20
Data contains the data""",
                    action="store_true",
                    required=False)
parser.add_argument('source', metavar="SOURCE", type=str,
                    help='The source data to show',)

if __name__ == "__main__":
    args = parser.parse_args()
    kwargs = {}
    for arg in args.args:
        if "=" in arg:
            key, value = arg.split("=")
            if value.startswith("int:"):
                value = int(value[len("int:"):])
            kwargs[key] = value
        else:
            kwargs[arg] = True
    data = blaze.odo(
        args.source,
        pandas.DataFrame,
        **kwargs
    )
    if args.ipython:
        import IPython
        dict_ = globals()
        dict_.update(locals())
        IPython.start_ipython(argv=[], user_ns=dict_)

    data.plot()
    plt.show()
Example #21
def test_annotate_df(dataframe1, dataframe2):
    df1 = odo(str(dataframe1), DataFrame, annotate=True)
    df2 = df1.append(odo(str(dataframe2), DataFrame, annotate=True))
    assert set([os.path.basename(x) for x in df2['uri']]) == \
        {'sample1.dataframe1.csv', 'sample2.dataframe2.csv'}
Example #22
def test_metrics(align_metrics):
    metrics = odo(str(align_metrics), DataFrame)
    assert metrics.loc["FIRST_OF_PAIR"]["MEAN_READ_LENGTH"] == 76
Example #23
from bokeh.models import ColumnDataSource
from bokeh.resources import CDN
from bokeh.protocol import serialize_json

# Import interactive blaze data
data = {
    'iris': bz.Data(bz.CSV('static/iris.data.txt')),
    'accounts': bz.Data('static/accounts.json'),
}

# Import some JSON data for JSON pointer example
with open('static/gh.json','r') as f:
    data['github'] = json.load(f)

# Dataframe for Bokeh demo
datademo = bz.odo( data['iris'], pd.DataFrame )
source = ColumnDataSource( datademo )


with open( 'app.yaml') as f:
    config = yaml.load( f, Loader=yaml.BaseLoader )

def keys():
    """
    Return dataframe keys
    """
    #
    return json.dumps( datademo.columns.values.tolist() )

def dataframe_html():
    """
Example #24
def test_rseqc_glob(rseqc_read_distribution, rseqc_read_distribution2):
    df = odo(
        os.path.join(os.path.dirname(os.path.dirname(str(rseqc_read_distribution))), "*/*distribution.txt"), DataFrame
    )
    assert df.shape == (20, 3)
Example #25
def test_hist_metrics(insert_metrics):
    metrics = odo(str(insert_metrics), DataFrame)
    hist = odo(str(insert_metrics), DataFrame, key="hist")
    assert all(metrics["MEDIAN_INSERT_SIZE"] == [156])
    assert all(hist["insert_size"] == [70,76,78])
Example #26
def test_star_log(star_data):
    df = odo(str(star_data), DataFrame)
    assert df.loc["% of reads unmapped: too short","value"] == 8.73
    assert df.loc["Uniquely mapped reads number","value"] == 4011114
Example #27
def test_cutadapt_se(cutadapt_se_data):
    df = odo(str(cutadapt_se_data), DataFrame)
    assert df.loc["Reads with adapters"]["value"] == 54
Example #28
def test_xls(xlsdata):
    df = odo(str(xlsdata), DataFrame)
    assert all(df["Position"] == range(0, 7))
Example #29
def test_annotate_df(dataframe1, dataframe2):
    df1 = odo(str(dataframe1), DataFrame, annotate=True)
    df2 = df1.append(odo(str(dataframe2), DataFrame, annotate=True))
    assert set([os.path.basename(x) for x in df2['uri']]) == {'sample1.dataframe1.csv', 'sample2.dataframe2.csv'}
Example #30
from pipeline import *
from operations import *
from features import FEATURE_CLASSES


translations_and_scores = pump(scores='data/hy.basic.scores.csv')


q_features = multiply(translations_and_scores, 'q', *FEATURE_CLASSES)

r_features = multiply(translations_and_scores, 'r', *FEATURE_CLASSES)

feature_deltas = delta(q_features, 'q', r_features, 'r', *FEATURE_CLASSES)
# delta is not commutative, so consider also the reverse:
#   delta(r_features, 'r', q_features, 'q', *FEATURE_CLASSES)


#print translations_and_scores
#print q_features
#print r_features
#print feature_deltas

from blaze import merge, odo
everything = merge(translations_and_scores, feature_deltas)
odo(everything, 'data/features.csv')
Example #31
z = np.arange(10)
z_hdf5 = xuan.create_dataset("z", data=z)

# if you want to get the contents under the 'folder':
[i for i in f.items()]

# if you want to drop a folder/dataset:

del f['xuan']


# About odo
import blaze as bz
import pandas as pd
x = bz.data("test.csv")
x = bz.odo(x, pd.DataFrame)



# About dask: a powerful data-processing module

# 1. the array method
## create and store

# generate test data
z = np.arange(2e4)
z = z.reshape((int(1e4), 2))

# this would raise a memory error
#z = z.dot(z.transpose())
Example #32
def test_fastqc_fq(fastqc_fq_summary_data):
    df = odo(str(fastqc_fq_summary_data), DataFrame)
    assert 1 == 1
    print(df)
Example #33
# generate train and test(i.e validation) files for caffe finetuning

import os
import sys
import numpy as np
import blaze as bz
import glob
import random

data_dir = os.environ['DATA_DIR']

# train and test images and label csvs
train_csv = bz.Data(data_dir + 'diabetic_ret/trainLabels.csv')

train = []
for img_name, label in train_csv:
    train.append(data_dir + 'diabetic_ret/train_resized/' + img_name +
                 '.jpeg ' + str(label))

random.shuffle(train)

bz.odo(train[len(train) // 5:], 'train.txt', sep='\n')
bz.odo(train[:len(train) // 5], 'val.txt', sep='\n')
Example #34
    def __init__(self):
        super(StockModel, self).__init__()
        file_name = "notebooks/db2.bcolz"
        self.df = bz.odo(file_name, pd.DataFrame)[['Date', 'Close']]  # [1000:1100]
        self.devol()
        self.returns_df = None
Example #35
def test_cutadapt_pe(cutadapt_pe_data):
    df = odo(str(cutadapt_pe_data), DataFrame)
    assert df.loc["Read 1 with adapter"]["value"] == 54
    assert list(df.loc["Read 1"]["value"]) == [76076, 4930, 66777]
Example #36
    def from_blaze(cls, filename, date_col='Date', value_col='Close'):
        df = bz.odo(filename, pd.DataFrame)[[date_col, value_col]]  # [1000:1100]
        df = df.rename(columns={value_col: 'Value'})
        ts = df.set_index(date_col)
        return cls(ts)
Example #37
def test_xls(xlsdata):
    df = odo(str(xlsdata), DataFrame)
    assert all(df["Position"] == range(0,7))
Example #38
def test_rseqc_read_distribution(rseqc_read_distribution):
    df = odo(str(rseqc_read_distribution), DataFrame)
    assert "TES_down_10kb" in df.index
    assert df.loc["Introns", "Tag_count"] == 2022848