Example #1
def read_csv(fn, *args, **kwargs):
    chunkbytes = kwargs.pop('chunkbytes', 2**25)  # 32 MB
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True

    kwargs = fill_kwargs(fn, args, kwargs)

    # Handle glob strings
    if '*' in fn:
        return concat([read_csv(f, *args, **kwargs) for f in sorted(glob(fn))])

    token = tokenize(os.path.getmtime(fn), args, kwargs)
    name = 'read-csv-%s-%s' % (fn, token)

    columns = kwargs.pop('columns')
    header = kwargs.pop('header')

    if 'nrows' in kwargs:  # Just create single partition
        dsk = {(name, 0): (apply, pd.read_csv, (fn,),
                                  assoc(kwargs, 'header', header))}
        result = DataFrame(dsk, name, columns, [None, None])

    else:
        # Chunk sizes and numbers
        total_bytes = file_size(fn, kwargs['compression'])
        nchunks = int(ceil(total_bytes / chunkbytes))
        divisions = [None] * (nchunks + 1)

        first_read_csv = partial(pd.read_csv, *args, header=header,
                               **dissoc(kwargs, 'compression'))
        rest_read_csv = partial(pd.read_csv, *args, header=None,
                              **dissoc(kwargs, 'compression'))

        # Create dask graph
        dsk = dict(((name, i), (rest_read_csv, (BytesIO,
                                   (textblock, fn,
                                       i*chunkbytes, (i+1) * chunkbytes,
                                       kwargs['compression']))))
                   for i in range(1, nchunks))
        dsk[(name, 0)] = (first_read_csv, (BytesIO,
                           (textblock, fn, 0, chunkbytes, kwargs['compression'])))

        result = DataFrame(dsk, name, columns, divisions)

    if categorize or index:
        categories, quantiles = categories_and_quantiles(fn, args, kwargs,
                                                         index, categorize,
                                                         chunkbytes=chunkbytes)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_partitions(func, columns=columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
Example #2
def test_get_params():
    d = from_sklearn(pipe1)
    params1 = d.get_params()
    params2 = pipe1.get_params()
    assert (dissoc(params1, 'steps', 'logistic',
                   'pca') == dissoc(params2, 'steps', 'logistic', 'pca'))
    params1 = d.get_params(deep=False)
    params2 = pipe1.get_params(deep=False)
    for dkstep, skstep in zip(params1['steps'], params2['steps']):
        # names are equivalent
        assert dkstep[0] == skstep[0]
        # ests have same params
        assert dkstep[1].get_params() == skstep[1].get_params()
Example #3
def read_csv(fn, *args, **kwargs):
    chunkbytes = kwargs.pop("chunkbytes", 2 ** 25)  # 50 MB
    categorize = kwargs.pop("categorize", None)
    index = kwargs.pop("index", None)
    if index and categorize is None:
        categorize = True

    kwargs = fill_kwargs(fn, args, kwargs)

    # Handle glob strings
    if "*" in fn:
        return concat([read_csv(f, *args, **kwargs) for f in sorted(glob(fn))])

    columns = kwargs.pop("columns")

    # Chunk sizes and numbers
    total_bytes = file_size(fn, kwargs["compression"])
    nchunks = int(ceil(total_bytes / chunkbytes))
    divisions = [None] * (nchunks + 1)

    header = kwargs.pop("header")

    first_read_csv = partial(pd.read_csv, *args, header=header, **dissoc(kwargs, "compression"))
    rest_read_csv = partial(pd.read_csv, *args, header=None, **dissoc(kwargs, "compression"))

    # Create dask graph
    name = "read-csv" + next(tokens)
    dsk = dict(
        (
            (name, i),
            (rest_read_csv, (BytesIO, (textblock, fn, i * chunkbytes, (i + 1) * chunkbytes, kwargs["compression"]))),
        )
        for i in range(1, nchunks)
    )
    dsk[(name, 0)] = (first_read_csv, (BytesIO, (textblock, fn, 0, chunkbytes, kwargs["compression"])))

    result = DataFrame(dsk, name, columns, divisions)

    if categorize or index:
        categories, quantiles = categories_and_quantiles(fn, args, kwargs, index, categorize, chunkbytes=chunkbytes)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_partitions(func, columns=columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
Example #4
def virginia_data(path):
    with open(path) as f:
        dat = (json.loads(l) for l in f)
        dat = (dissoc(d, '@context', '@type', 'jobLocation', 'baseSalary',
                      '_id') for d in dat)
        dat = list(dat)
    return dat
Example #5
def _remove_base_fee_if_none(block):
    """
    A `None` value is set for `base_fee_per_gas` during validation for blocks that do not have a
    base fee (pre-London blocks). Pop this value out here to normalize pre-London blocks.
    """
    return block if block['base_fee_per_gas'] else dissoc(
        block, 'base_fee_per_gas')
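A minimal usage sketch (block dicts made up for illustration, not from the original source), showing both branches:

# Hedged sketch: a missing/None base fee drops the key, a real one leaves the block unchanged.
_remove_base_fee_if_none({'number': 1, 'base_fee_per_gas': None})
# -> {'number': 1}
_remove_base_fee_if_none({'number': 2, 'base_fee_per_gas': 1000000000})
# -> {'number': 2, 'base_fee_per_gas': 1000000000}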
Example #6
def transform(filter_fn, transform_fn, data):
    partial_dict = toolz.valfilter(filter_fn, data)
    rest = toolz.dissoc(data, *partial_dict.keys())
    return toolz.merge(
        rest,
        toolz.valmap(transform_fn, partial_dict),
    )
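A quick usage sketch (toy data, not from the original project) of how the filter and transform compose:

# Hedged sketch: upper-case only the string values, merge the untouched rest back in.
transform(lambda v: isinstance(v, str), str.upper, {"a": 1, "b": "x", "c": "y"})
# -> {"a": 1, "b": "X", "c": "Y"}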
Example #7
def doc2bytes(doc: Document_, purge_id: bool = False) -> Tuple[bytes, bytes]:
    """doc is

    Either:

    1. (id, {..}) key-value tuple
    2. {id: , ..}  dict with `id` field

    id has to be a UUID (typically string)
    """
    if isinstance(doc, tuple):
        k, d = doc
    else:
        k_ = doc.get("id")
        if not isinstance(k_, (str, UUID)):
            raise ValueError("Expect id key to be a string or a UUID")

        k = k_
        d = toolz.dissoc(doc, ["id"]) if purge_id else doc

    if not isinstance(k, UUID):
        if not isinstance(k, str):
            raise ValueError("Expect id key to be a string or a UUID")

        k = UUID(k)

    raw_k = k.bytes
    raw_doc = json.dumps(d, separators=(",", ":")).encode("utf8")
    return (raw_k, raw_doc)
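A usage sketch for the two accepted document shapes (the UUID and payload below are made up):

# Hedged sketch: tuple form vs. dict form with purge_id.
key = "12345678-1234-5678-1234-567812345678"
doc2bytes((key, {"name": "scene-a"}))                               # (id, dict) tuple form
doc2bytes({"id": key, "name": "scene-a"}, purge_id=True)            # dict form; 'id' stripped from the payload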
Example #8
def remove_method(id, action):
    provider = Provider.get_by_id(id)
    if not provider:
        raise EntityNotFound('Provider')
    provider.methods = dissoc(provider.methods or {}, action)
    provider.put()
    return None
Example #9
def recreate_database(driver, params, **kwargs):
    url = sa.engine.url.URL(driver, **dissoc(params, 'database'))
    engine = sa.create_engine(url, **kwargs)

    with engine.connect() as conn:
        conn.execute('DROP DATABASE IF EXISTS {}'.format(params['database']))
        conn.execute('CREATE DATABASE {}'.format(params['database']))
Example #10
    def _step_prephysics(self) -> Diagnostics:

        if self._prephysics_stepper is None:
            diagnostics: Diagnostics = {}
        else:
            self._log_debug("Computing prephysics updates")
            _, diagnostics, state_updates = self._prephysics_stepper(
                self._state.time, self._state)
            if self._prephysics_only_diagnostic_ml:
                rename_diagnostics(diagnostics)
            else:
                self._state_updates.update(state_updates)
        state_updates = {
            k: v
            for k, v in self._state_updates.items()
            if k in PREPHYSICS_OVERRIDES
        }
        self._state_updates = dissoc(self._state_updates,
                                     *PREPHYSICS_OVERRIDES)
        self._log_debug(
            f"Applying prephysics state updates for: {list(state_updates.keys())}"
        )
        self._state.update_mass_conserving(state_updates)

        return diagnostics
Example #11
def to_bars(node):
    children = node.get('children')
    if not children:
        return [node]
    new_children = concat(to_bars(child) for child in children)
    bar = dissoc(node, 'children')
    return list(concatv(new_children, [bar]))
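A usage sketch (toy two-leaf tree, not from the original source) showing the flattening order: leaves first, then the parent with 'children' removed:

# Hedged sketch of the bottom-up flattening.
tree = {'name': 'root', 'children': [{'name': 'leaf-a'}, {'name': 'leaf-b'}]}
to_bars(tree)
# -> [{'name': 'leaf-a'}, {'name': 'leaf-b'}, {'name': 'root'}]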
Example #12
def recreate_database(driver, params, **kwargs):
    url = sa.engine.url.URL(driver, **dissoc(params, 'database'))
    engine = sa.create_engine(url, **kwargs)

    with engine.connect() as conn:
        conn.execute('DROP DATABASE IF EXISTS {}'.format(params['database']))
        conn.execute('CREATE DATABASE {}'.format(params['database']))
Example #13
def _merge_coverage(cnns, data):
    """Merge split CNN outputs into final consolidated output.
    """
    out = []
    for (out_file,
         _), members in tz.groupby(lambda x: (x["final_out"], x["bed_orig"]),
                                   cnns).items():
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for i, in_file in enumerate([
                            x["file"]
                            for x in sorted(members, key=lambda x: x["bed_i"])
                    ]):
                        with open(in_file) as in_handle:
                            header = in_handle.readline()
                            if i == 0:
                                out_handle.write(header)
                            for line in in_handle:
                                out_handle.write(line)
        base = copy.deepcopy(members[0])
        base = tz.dissoc(base, "final_out", "bed_i", "bed_orig")
        base["file"] = out_file
        out.append(base)
    return out
Example #14
def _artifact_from_record(repo, record):
    if isinstance(record, Artifact):
        return record
    return Artifact(repo,
                    t.dissoc(record._asdict(), 'value', 'inputs', 'run_info'),
                    value=record.value, inputs=record.inputs,
                    run_info=record.run_info)
Example #15
def atomic_item_from_config(config, type_dict, item_plural, name=None):
    stype = config['type']
    if stype not in type_dict:
        raise Exception("{} may only be created of types: {}, you had {}".
                        format(item_plural, tuple(type_dict.keys()), stype))
    cls = type_dict[stype]
    kargs = t.dissoc(config, 'type')
    return cls(**kargs)
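A usage sketch with a hypothetical type registry (the Queue class below is a stand-in, not from the original source); the 'type' key only selects the class and is removed before construction:

# Hedged sketch of config-driven construction.
class Queue:
    def __init__(self, name):
        self.name = name

atomic_item_from_config({'type': 'queue', 'name': 'jobs'}, {'queue': Queue}, 'queues')
# -> Queue instance built as Queue(name='jobs')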
Example #16
def param_info(f):
    if is_curry_func(f):
        argspec = inspect.getfullargspec(f.func)
        num_args = len(f.args)
        args_to_remove = argspec.args[0:num_args] + list(f.keywords.keys())
        base = _func_param_info(argspec)
        return t.dissoc(base, *args_to_remove)
    return _func_param_info(inspect.getfullargspec(f))
Example #17
def get_df(conn_string):
    db = records.Database(conn_string)
    dat = db.query('select * from messages')
    dat = (r.as_dict() for r in dat)
    dat = ({**json.loads(r['content']), 'userid': r['userid']} for r in dat)
    dat = (dissoc(d, 'recipient', 'sender') for d in dat)
    dat = ({
        **d, 'text': get_in(['message', 'text'], d),
        'ref': get_ref(d)
    } for d in dat)
    dat = ({
        **d, 'event_type': get_in(['event', 'type'], d),
        'ref': get_ref(d)
    } for d in dat)
    dat = (dissoc(d, 'message', 'referral', 'user', 'page', 'postback',
                  'event', 'data') for d in dat)
    return pd.DataFrame(list(dat))
Example #18
def main(answer_file, truth_file, format, delimiter, response_field):
    if format == "csv":
        answer_reader = csv.reader(answer_file, delimiter=delimiter)
        truth_reader = csv.reader(truth_file, delimiter=delimiter)
    elif format == "csv-header":
        answer_reader = csv.DictReader(answer_file, delimiter=delimiter)
        truth_reader = csv.DictReader(truth_file, delimiter=delimiter)
    elif format == "json":
        answer_reader = map(json.loads, answer_file)
        truth_reader = map(json.loads, truth_file)

    answer_lines = [line for line in answer_reader]
    truth_lines = [line for line in truth_reader]

    if len(answer_lines) != len(truth_lines):
        print("{} has {} lines, but {} has {} lines.".format(
            answer_file.name, len(answer_lines), truth_file.name,
            len(truth_lines)))

    for answer, truth in zip(answer_lines, truth_lines):
        if format == "csv":
            answer_request = answer[:-1]
            answer_response = json.loads(answer[-1])
            truth_request = truth[:-1]
            truth_response = json.loads(truth[-1])
        elif format == "csv-header":
            answer_request = dissoc(answer, response_field)
            answer_response = json.loads(answer[response_field])
            truth_request = dissoc(truth, response_field)
            truth_response = json.loads(truth[response_field])
        elif format == "json":
            answer_request = dissoc(answer, response_field)
            answer_response = answer[response_field]
            truth_request = dissoc(truth, response_field)
            truth_response = truth[response_field]

        if answer_request != truth_request:
            print("{} is not equal to {}.".format(answer_request,
                                                  truth_request))

        if answer_response != truth_response:
            print("{} is not equal to {}.".format(answer_response,
                                                  truth_response))

    print("Done checking {} against {}.".format(answer_file.name,
                                                truth_file.name))
Example #19
    def get_account_history(self, index, limit, start=None, stop=None, order=-1, filter_by=None, raw_output=False):
        """ A generator over steemd.get_account_history.

        It offers serialization, filtering and fine grained iteration control.

        Args:
            index (int): start index for get_account_history
            limit (int): How many items are we interested in.
            start (int): (Optional) skip items until this index
            stop (int): (Optional) stop iteration early at this index
            order: (1, -1): 1 for chronological, -1 for reverse order
            filter_by (str, list): filter out all but these operations
            raw_output (bool): (Defaults to False). If True, return history in steemd format (unchanged).
        """
        history = self.steemd.get_account_history(self.name, index, limit)
        for item in history[::order]:
            index, event = item

            # start and stop utilities for chronological generator
            if start and index < start:
                continue

            if stop and index > stop:
                return

            op_type, op = event['op']
            block_props = dissoc(event, 'op')

            def construct_op(account_name):
                # verbatim output from steemd
                if raw_output:
                    return item

                # index can change during reindexing in
                # future hard-forks. Thus we cannot take it for granted.
                immutable = {
                    **op,
                    **block_props,
                    'account': account_name,
                    'type': op_type,
                }
                _id = Blockchain.hash_op(immutable)
                return {
                    **immutable,
                    '_id': _id,
                    'index': index,
                }

            if filter_by is None:
                yield construct_op(self.name)
            else:
                if type(filter_by) is list:
                    if op_type in filter_by:
                        yield construct_op(self.name)

                if type(filter_by) is str:
                    if op_type == filter_by:
                        yield construct_op(self.name)
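A hedged usage sketch; the Account wrapper, account name, and printed fields are assumptions, only the method signature documented above is from the source:

# Hedged sketch (needs a live steemd connection, so left as comments):
# account = Account('some-account')
# for op in account.get_account_history(index=-1, limit=100, filter_by='transfer'):
#     print(op['type'], op['index'])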
Example #20
def column_run(model, prognostic, forcing,
               batch_dims=('x', 'y')):

    batch_dims = [dim for dim in batch_dims if dim in prognostic.dims]
    if batch_dims:
        prognostic = prognostic.stack(batch=batch_dims)
        forcing = forcing.stack(batch=batch_dims)

    prog = _dataset_to_dict(prognostic)
    forcing = _dataset_to_dict(forcing)

    prog.pop('p')
    w = prog.pop('w')

    z = Variable(torch.FloatTensor(prognostic.z.values))

    input_data = {
        'prognostic': prog,
        'forcing': forcing,
        'constant': {
            'w': w,
            'z': z
        }
    }

    model.eval()
    with torch.no_grad():
        y = model(input_data)

    coords = {'z': prognostic['z'], 'time': prognostic['time']}
    if 'batch' in prognostic.dims:
        coords['batch'] = prognostic.batch

    dims = ['time', 'batch', 'z']

    progs = {
        key: xr.DataArray(
            y['prognostic'][key].data.numpy(), coords=coords, dims=dims).unstack('batch')
        for key in y['prognostic']
    }

    coords['time'] = coords['time'][1:]

    # convert diagnostics to xarray
    diags_torch = y['diagnostic']
    diags = {
        key: xr.DataArray(diags_torch[key].data.numpy()[..., 0],
                          coords=dissoc(coords, 'z'),
                          dims=['time', 'batch'],
                          attrs=variable_attributes.get(key, None))
        .unstack('batch')
        for key in diags_torch
    }

    # add it to the dict of progs
    progs.update(diags)

    return xr.Dataset(progs)
Example #21
def _read_csv(path, schema, **kwargs):
    dtypes = dict(schema.to_pandas())

    dates = list(toolz.valfilter(lambda s: s == 'datetime64[ns]', dtypes))
    dtypes = toolz.dissoc(dtypes, *dates)

    return pd.read_csv(
        str(path), dtype=dtypes, parse_dates=dates, encoding='utf-8', **kwargs
    )
Example #22
def process_node(node):
    children = node.get('children')
    if children:
        new_children = [process_node(child) for child in children]
        new_node = merge(node, {'children': new_children})
    else:
        new_node = merge(node, {'value': node['values'][index]})
    new_node = dissoc(new_node, 'values')
    return new_node
Example #23
def _dict_replace_processor(request_record,
                            response,
                            parse=identity,
                            request_field="request",
                            response_field="response"):
    return {
        response_field: parse(response),
        **dissoc(request_record, request_field)
    }
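A usage sketch (made-up record) of the default field names: the raw request field is dropped and replaced by the parsed response.

# Hedged sketch using json.loads as the parser.
import json
record = {"request": '{"q": "hi"}', "meta": 1}
_dict_replace_processor(record, '{"answer": 42}', parse=json.loads)
# -> {"response": {"answer": 42}, "meta": 1}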
Example #24
def _read_csv(path, schema, **kwargs):
    dtypes = dict(schema.to_pandas())

    dates = list(toolz.valfilter(lambda s: s == 'datetime64[ns]', dtypes))
    dtypes = toolz.dissoc(dtypes, *dates)

    return pd.read_csv(
        str(path), dtype=dtypes, parse_dates=dates, encoding='utf-8', **kwargs
    )
Example #25
def plot_figure(data, **kwargs):
    from bokeh.plotting import ColumnDataSource, figure
    from bokeh.models import HoverTool

    if 'states' in data:
        data = toolz.dissoc(data, 'states')

    source = ColumnDataSource(data=data)

    fig = figure(tools='tap', **kwargs)
    r = fig.quad('left',
                 'right',
                 'top',
                 'bottom',
                 color='color',
                 line_color='black',
                 line_width=2,
                 source=source)

    r.selection_glyph = None
    r.nonselection_glyph = None

    hover = HoverTool(point_policy="follow_mouse",
                      tooltips="""
            <div>
                <span style="font-size: 14px; font-weight: bold;">Name:</span>&nbsp;
                <span style="font-size: 10px; font-family: Monaco, monospace;">@name</span>
            </div>
            <div>
                <span style="font-size: 14px; font-weight: bold;">Filename:</span>&nbsp;
                <span style="font-size: 10px; font-family: Monaco, monospace;">@filename</span>
            </div>
            <div>
                <span style="font-size: 14px; font-weight: bold;">Line number:</span>&nbsp;
                <span style="font-size: 10px; font-family: Monaco, monospace;">@line_number</span>
            </div>
            <div>
                <span style="font-size: 14px; font-weight: bold;">Line:</span>&nbsp;
                <span style="font-size: 10px; font-family: Monaco, monospace;">@line</span>
            </div>
            <div>
                <span style="font-size: 14px; font-weight: bold;">Time:</span>&nbsp;
                <span style="font-size: 10px; font-family: Monaco, monospace;">@time</span>
            </div>
            <div>
                <span style="font-size: 14px; font-weight: bold;">Percentage:</span>&nbsp;
                <span style="font-size: 10px; font-family: Monaco, monospace;">@width</span>
            </div>
            """)
    fig.add_tools(hover)

    fig.xaxis.visible = False
    fig.yaxis.visible = False
    fig.grid.visible = False

    return fig, source
Example #26
def _make_access_list_txn(chain_id=131277322940537, access_list=[], **kwargs,):
    legacy_kwargs = dissoc(dict(**kwargs), "chain_id", "access_list")
    return merge(
        _make_legacy_txn(**legacy_kwargs),
        {
            "type": "0x1",
            "chain_id": chain_id,
            "access_list": access_list,
        }
    )
Example #27
def transaction_rlp_to_rpc_structure(dictionary: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert an rlp-structured transaction to a JSON-RPC-structured transaction.
    """
    access_list = dictionary.get('accessList')
    if access_list:
        dictionary = dissoc(dictionary, 'accessList')
        rpc_structured_access_list = _access_list_rlp_to_rpc_structure(access_list)
        dictionary = assoc(dictionary, 'accessList', rpc_structured_access_list)
    return dictionary
Example #28
def _param_info(f):
    if _is_curry_func(f):
        argspec = getargspec(f.func)
        num_args = len(f.args)
        args_to_remove = argspec.args[0:num_args]
        if f.keywords:
            args_to_remove += list(f.keywords.keys())
        base = _func_param_info(argspec)
        return t.dissoc(base, *args_to_remove)
    return (_func_param_info(getargspec(f)))
Example #29
    def put(self, wait: bool = True, force: bool = False):
        kwargs = {self.index_id_key: self.index_id, **self.processed_config}

        for unsupported_kwarg in ('GlobalSecondaryIndexes',
                                  'LocalSecondaryIndexes'):
            if unsupported_kwarg in kwargs:
                raise NotImplementedError(
                    '{} not supported for dynamodb tables'.format(
                        unsupported_kwarg))

        if self.exists:
            # SDK won't let you call UpdateTable with same ProvisionedThroughput.
            if all(self.boto3_resource().provisioned_throughput[kk] ==
                   kwargs.get('ProvisionedThroughput', {}).get(kk)
                   for kk in ('ReadCapacityUnits', 'WriteCapacityUnits')):
                filtered_kwargs = dissoc(kwargs, 'ProvisionedThroughput')
            else:
                filtered_kwargs = kwargs

            # SDK won't let you call UpdateTable without actually updating anything.
            try:
                apply_with_relevant_kwargs(self.service_client,
                                           self.service_client.update_table,
                                           filtered_kwargs)
            except botocore.exceptions.ClientError as err:
                if err.response['Error']['Code'] == 'ValidationException':
                    logger.info(
                        'Not updating because nothing to update; SDK message: {}'
                        .format(err.response['Error']['Message']))
                else:
                    raise

            apply_with_relevant_kwargs(self.service_client,
                                       self.service_client.tag_resource, {
                                           'ResourceArn': self.arn,
                                           **kwargs
                                       })
        else:
            try:
                logger.info(f'creating {self.sdk_name}')
                self.create(wait=True)
                assert self.index_id
                assert self.exists
                apply_with_relevant_kwargs(self.service_client,
                                           self.service_client.tag_resource, {
                                               'ResourceArn': self.arn,
                                               **kwargs
                                           })
                logger.info(f'finished creating {self.sdk_name}')
            except self.service_client.exceptions.TableAlreadyExistsException:
                # this should never happen
                logger.error('possible race condition encountered')
                raise
Example #30
def remove_index_entry(index: Index, entry: str, id: Union[int, str]) -> Index:
    if entry is None:
        return index
    try:
        if len(index[entry]) == 1:
            return t.dissoc(index, entry)
        else:
            return t.update_in(index, [entry], lambda x: x - {id})
    except KeyError:
        error('''
            It seems the Index is corrupt. Please run 
            `mdn regenerate` and try again''')
        assert False  # just for mypy <3
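A usage sketch (toy index, not from the original source) of the two branches: removing the last id under an entry drops the whole entry, otherwise only that id is removed.

# Hedged sketch with a hypothetical index mapping entries to sets of ids.
index = {"python": {1, 2}, "toolz": {3}}
remove_index_entry(index, "toolz", 3)   # -> {"python": {1, 2}}
remove_index_entry(index, "python", 1)  # -> {"python": {2}, "toolz": {3}}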
Example #31
def dashboard():
    '''Default view with box for adding measurements and list of five last
    entries.'''
    # request.form.last_added
    data = read_last_measurements() or None
    g.entry_model = entry_model

    form = MeasurementForm()
    if request.method == 'POST' and form.validate():
        add_measurement(dissoc(form.data, 'submit'))
        flash("Merkintä lisätty.")
        return redirect(url_for('measurements.dashboard'))

    return render_template('index.html', form=form, measurements=data)
Example #32
    async def test_found(self, database, threat_factory):
        id_ = 1
        threat = threat_factory(id=id_)
        insert_threat(threat.dict())

        record = assoc(dissoc(threat.dict(), "id"), "threat_id", id_)
        insert_threat_record(record)

        result = await threat_repository.fetch_history_or_raise(id_)

        assert result
        assert result.threat_id == id_

        getter = itemgetter("danger_level", "location")
        assert getter(result.records[0].dict()) == getter(record)
Example #33
    def __str__(self):
        """
        String representation.
        """
        def to_output(t):
            if isinstance(t, Task):
                return t.kwargs['output']
            return t
        kwargs = {k: to_output(v) for k, v in self.kwargs.items()}
        kwargs = toolz.dissoc(kwargs, 'output')
        if self.args:
            args = [to_output(arg) for arg in self.args]
            return f'{self.name}({args}, {kwargs})'
        else:
            return f'{self.name}({kwargs})'
Example #34
def read_csv(fn, *args, **kwargs):
    if 'nrows' in kwargs:  # Just create single partition
        df = read_csv(fn, *args, **dissoc(kwargs, 'nrows'))
        return df.head(kwargs['nrows'], compute=False)

    chunkbytes = kwargs.pop('chunkbytes', 2**25)  # 32 MB
    index = kwargs.pop('index', None)
    kwargs = kwargs.copy()

    kwargs = fill_kwargs(fn, args, kwargs)

    # Handle glob strings
    if '*' in fn:
        from .multi import concat
        return concat([read_csv(f, *args, **kwargs) for f in sorted(glob(fn))])

    token = tokenize(os.path.getmtime(fn), args, kwargs)
    name = 'read-csv-%s-%s' % (fn, token)
    bom = get_bom(fn)

    columns = kwargs.pop('columns')
    header = kwargs.pop('header')

    # Chunk sizes and numbers
    total_bytes = file_size(fn, kwargs['compression'])
    nchunks = int(ceil(total_bytes / chunkbytes))
    divisions = [None] * (nchunks + 1)

    first_kwargs = merge(kwargs, dict(header=header, compression=None))
    rest_kwargs = merge(kwargs, dict(header=None, compression=None))

    # Create dask graph
    dsk = dict(((name, i), (_read_csv, fn, i, chunkbytes,
                                       kwargs['compression'], rest_kwargs,
                                       bom))
               for i in range(1, nchunks))

    dsk[(name, 0)] = (_read_csv, fn, 0, chunkbytes, kwargs['compression'],
                                 first_kwargs, b'')

    result = DataFrame(dsk, name, columns, divisions)

    if index:
        result = result.set_index(index)

    return result
Example #35
def to_json(line):
    """Convert a line of json into a cleaned up dict."""
    # Convert timestamps into Timestamp objects
    date = line['created_utc']
    line['created_utc'] = Timestamp.utcfromtimestamp(int(date))
    edited = line['edited']
    line['edited'] = Timestamp.utcfromtimestamp(int(edited)) if edited else NaT

    # Convert deleted posts into `None`s (missing text data)
    if line['author'] == '[deleted]':
        line['author'] = None
    if line['body'] == '[deleted]':
        line['body'] = None

    # Remove 'id' and 'subreddit_id' as they're redundant
    # Remove 'retrieved_on' as it's irrelevant
    return dissoc(line, 'id', 'subreddit_id', 'retrieved_on')
Example #36
def _merge_coverage(cnns, data):
    """Merge split CNN outputs into final consolidated output.
    """
    out = []
    for (out_file, _), members in tz.groupby(lambda x: (x["final_out"], x["bed_orig"]), cnns).items():
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for i, in_file in enumerate([x["file"] for x in sorted(members, key=lambda x: x["bed_i"])]):
                        with open(in_file) as in_handle:
                            header = in_handle.readline()
                            if i == 0:
                                out_handle.write(header)
                            for line in in_handle:
                                out_handle.write(line)
        base = copy.deepcopy(members[0])
        base = tz.dissoc(base, "final_out", "bed_i", "bed_orig")
        base["file"] = out_file
        out.append(base)
    return out
Example #37
def workers(s):
    """ Information about workers

    Examples
    --------
    >>> workers(my_scheduler)  # doctest: +SKIP
    {'127.0.0.1': {'cores': 3,
                   'cpu': 0.0,
                   'last-seen': 0.003068,
                   'latency': 0.01584628690034151,
                   'ports': ['54871', '50943'],
                   'processing': {'inc': 2, 'add': 1},
                   'disk-read': 1234,
                   'disk-write': 1234,
                   'network-send': 1234,
                   'network-recv': 1234,
                   'memory': 16701911040,
                   'memory-percent': 85}}
    """
    hosts = {host: ['%s:%s' % (host, port) for port in d['ports']]
                for host, d in s.host_info.items()}

    processing = {host: countby(key_split, concat(s.processing[w] for w in addrs))
                  for host, addrs in hosts.items()}

    now = datetime.now()

    result = {}
    for host, info in s.host_info.items():
        info = dissoc(info, 'heartbeat', 'heartbeat-port')
        info['processing'] = processing[host]
        result[host] = info
        info['ports'] = list(info['ports'])
        if 'last-seen' in info:
            info['last-seen'] = (now - info['last-seen']).total_seconds()

    return result
Example #38
def clean_blob(f):
    return json.dumps(dissoc(f, "id", "properties"))
Example #39
    )

    result = pd.Timestamp(result)
    expected = pd.Timestamp(expected)
    if compare_nat_equal and pd.isnull(result) and pd.isnull(expected):
        return

    assert_equal.dispatch(object, object)(
        result,
        expected,
        path=path,
        **kwargs
    )


def assert_isidentical(result, expected, msg=''):
    assert result.isidentical(expected), (
        '%s%s is not identical to %s' % (_fmt_msg(msg), result, expected)
    )


try:
    # pull the dshape cases in
    from datashape.util.testing import assert_dshape_equal
except ImportError:
    pass
else:
    assert_equal.funcs.update(
        dissoc(assert_dshape_equal.funcs, (object, object)),
    )
Example #40
    expected = pd.Timestamp(expected)
    if compare_nat_equal and pd.isnull(result) and pd.isnull(expected):
        return

    assert_equal.dispatch(object, object)(result, expected, path=path, **kwargs)


@assert_equal.register(slice, slice)
def assert_slice_equal(result, expected, path=(), msg=""):
    diff_start = (
        ("starts are not equal: %s != %s" % (result.start, expected.start)) if result.start != expected.start else ""
    )
    diff_stop = ("stops are not equal: %s != %s" % (result.stop, expected.stop)) if result.stop != expected.stop else ""
    diff_step = ("steps are not equal: %s != %s" % (result.step, expected.step)) if result.step != expected.step else ""
    diffs = diff_start, diff_stop, diff_step

    assert not any(diffs), "%s%s\n%s" % (_fmt_msg(msg), "\n".join(filter(None, diffs)), _fmt_path(path))


def assert_isidentical(result, expected, msg=""):
    assert result.isidentical(expected), "%s%s is not identical to %s" % (_fmt_msg(msg), result, expected)


try:
    # pull the dshape cases in
    from datashape.util.testing import assert_dshape_equal
except ImportError:
    pass
else:
    assert_equal.funcs.update(dissoc(assert_dshape_equal.funcs, (object, object)))