def extract_and_capitalize_headlines_from_corpus(corpus_dir, docids):
    """
    Iterate through all the files in `corpus_dir`,
    extract the headlines, capitalize them, and return them

    Parameters
    ---------------
    corpus_dir: string

    docids: list of string
        the documents to be processed

    Returns
    --------------
    generator of (error, (docid, headlines)): (Exception or None, (str, list<list<str>>))
    """
    get_tokens = partial(map, partial(get_in, ["token"]))
    get_features = partial(get_in, ["features"])

    make_capitalized_title_new = lambda words: make_capitalized_title(title_words=words)

    for docid in docids:
        p = Path(corpus_dir) / Path(docid)
        auxil_p = p.with_suffix(".auxil")
        paf_p = p.with_suffix(".paf")
        if auxil_p.exists() and paf_p.exists():
            try:
                titles, _ = separate_title_from_body(str(auxil_p), str(paf_p))
            except Exception as e:
                yield (e, None)
                continue
            # pipeline:
            # -> get features
            # -> get tokens
            # -> capitalize headline
            yield (None, (p.name, list(map(compose(make_capitalized_title_new, get_tokens, get_features), titles))))
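A minimal consumption sketch for the generator above; the corpus directory and document ids are hypothetical placeholders.

# Hypothetical usage sketch: consume the (error, result) pairs yielded above.
for err, payload in extract_and_capitalize_headlines_from_corpus('corpus/', ['doc-001', 'doc-002']):
    if err is not None:
        print('failed to split title from body:', err)
        continue
    docid, headlines = payload
    print(docid, headlines)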
Example #2
def ccds_to_bed(ccds_stream):
  """Convert CCDS dump to Chanjo-style BED stream.

  Main entry point for default Chanjo converter (ccds). It converts
  a sorted (start, chrom) CCDS database to the Chanjo BED-format.

  Args:
    ccds_stream (file): file handle to read CCDS lines from

  Yields:
    Interval: interval with merged block and superblock ids
  """
  return pipe(
    ccds_stream,
    filter(grep('Public')),                    # keep only 'Public' tx
    map(text_type.rstrip),                     # strip \n and spaces
    map(split(sep='\t')),                      # split into list
    map(extract_intervals),                    # convert to Interval
    concat,                                    # flatten
    map(rename_sex_interval),                  # rename sex contigs
    partial(lazy_groupby, key=attrgetter('contig')),  # group by contig
    pluck(1),                                  # extract second item
    map(groupby(attrgetter('name'))),          # non-lazy group by id
    map(valmap(merge_related_elements)),       # group intervals
    map(itervalues),                           # extract values
    map(partial(sorted, key=attrgetter('start'))),  # sort by start pos
    concat                                     # flatten
  )
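A usage sketch under the assumption that the CCDS dump is a plain tab-separated text file; the filename is a placeholder.

# Hypothetical usage sketch: convert a CCDS dump and print the resulting intervals.
with open('CCDS.current.txt') as ccds_handle:
    for interval in ccds_to_bed(ccds_handle):
        print(interval.contig, interval.name, interval.start)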
Example #3
def read_csv(fn, *args, **kwargs):
    chunksize = kwargs.pop('chunksize', 2**16)
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True
    header = kwargs.get('header', 1)

    nlines = linecount(fn) - header
    nchunks = int(ceil(1.0 * nlines / chunksize))

    read = next(read_csv_names)

    blockdivs = tuple(range(chunksize, nlines, chunksize))

    one_chunk = pd.read_csv(fn, *args, nrows=100, **kwargs)

    cols = []

    if categorize or index:
        if categorize:
            category_columns = [c for c in one_chunk.dtypes.index
                                   if one_chunk.dtypes[c] == 'O']
        else:
            category_columns = []
        cols = category_columns + ([index] if index else [])
        d = read_csv(fn, *args, **merge(kwargs,
                                        dict(chunksize=chunksize,
                                             usecols=cols,
                                             categorize=False,
                                             parse_dates=None)))
        categories = [d[c].drop_duplicates() for c in category_columns]
        if index:
            quantiles = d[index].quantiles(np.linspace(0, 100, nchunks + 1)[1:-1])
            result = compute(quantiles, *categories)
            quantiles, categories = result[0], result[1:]
        else:
            categories = compute(*categories)
        categories = dict(zip(category_columns, categories))

    kwargs['chunksize'] = chunksize
    load = {(read, -1): (partial(pd.read_csv, *args, **kwargs), fn)}
    load.update(dict(((read, i), (get_chunk, (read, i-1), chunksize*i))
                     for i in range(nchunks)))

    name = next(names)

    dsk = dict(((name, i), (getitem, (read, i), 0))
                for i in range(nchunks))

    result = DataFrame(merge(dsk, load), name, one_chunk.columns, blockdivs)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_blocks(func, columns=result.columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
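A usage sketch for the blocked reader above; the filename and the index column are placeholders.

# Hypothetical usage sketch: build a blocked DataFrame, categorizing object columns
# and partitioning on an index column.
df = read_csv('accounts.csv', chunksize=2**16, categorize=True, index='id')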
Example #4
def test_to_tree_slice(serial):
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t[:5]
    expr2 = pipe(expr,
                 partial(to_tree, names={t: 't'}),
                 serial.dumps,
                 serial.loads,
                 partial(from_tree, namespace={'t': t}))
    assert expr.isidentical(expr2)
def working_datetime_ranges_of_date(d,
                                    special_working_hours={},
                                    week_working_hours={},
                                    merge_tomorrow=True):
    """
    Returns a list of datetimes tuples (datetime_range),
    indicating contiguous working periods of given date, if merge_tomorrow
    check if first period of tomorrow is contiguous and merge
    with last of today.
    """

    # curried on working hours
    whs_by_date = partial(working_hours_of_date,
                          special_working_hours=special_working_hours,
                          week_working_hours=week_working_hours)
    # curried on date
    whs_to_dt_ranges = partial(working_hours_to_datetime_ranges, d)

    today_working_hours = whs_by_date(d)

    if not len(today_working_hours):
        return []

    if not merge_tomorrow:
        return whs_to_dt_ranges(today_working_hours)

    tomorrow_working_hours = whs_by_date(tomorrow(d))

    if are_working_hours_contiguous(today_working_hours,
                                    tomorrow_working_hours):
        # the last range of today becomes a merged range between
        # the last of today and the first of tomorrow

        next_day = tomorrow(d)

        # When tomorrow's working hours end at 00:00 they must be (00:00, 00:00),
        # because they are contiguous with today's working hours. In that case we
        # add one more day to the current date, since the period really ends at
        # 00:00 of the day after; this covers 24/7-like situations.
        if tomorrow_working_hours[0][1] == time(0):
            next_day = tomorrow(next_day)

        last_period = (
            datetime.combine(d, today_working_hours[-1][0]),
            datetime.combine(next_day, tomorrow_working_hours[0][1])
        )

        return whs_to_dt_ranges(today_working_hours[:-1]) + [last_period]

    return whs_to_dt_ranges(today_working_hours)
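A usage sketch; the weekday-keyed shape of the working-hours mapping is an assumption about working_hours_of_date, which is not shown here.

# Hypothetical usage sketch: Mondays 09:00-13:00 and 14:00-18:00, no merging with tomorrow.
from datetime import date, time

week_hours = {0: [(time(9), time(13)), (time(14), time(18))]}
ranges = working_datetime_ranges_of_date(date(2015, 6, 1),  # a Monday
                                         week_working_hours=week_hours,
                                         merge_tomorrow=False)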
Example #6
def get(dsk, keys, optimizations=[fuse], num_workers=cpu_count):
    """ Multiprocessed get function appropriate for Bags """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(psutil.cpu_count())
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = dill_apply_async(pool.apply_async)

    # Optimize Dask
    dsk2 = pipe(dsk, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, cpu_count, dsk2, keys,
                           queue=queue)
    finally:
        if cleanup:
            pool.close()
    return result
Example #7
 def __getattr__(self, key):
     if key == '_hash':
         raise AttributeError()
     try:
         return _attr_cache[(self, key)]
     except:
         pass
     try:
         result = object.__getattribute__(self, key)
     except AttributeError:
         fields = dict(zip(map(valid_identifier, self.fields),
                           self.fields))
         if self.fields and key in fields:
             if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                 result = self
             else:
                 result = self[fields[key]]
         else:
             d = toolz.merge(schema_methods(self.dshape.measure),
                             dshape_methods(self.dshape))
             if key in d:
                 func = d[key]
                 if func in method_properties:
                     result = func(self)
                 else:
                     result = functools.update_wrapper(partial(func, self),
                                                       func)
             else:
                 raise
     _attr_cache[(self, key)] = result
     return result
Example #8
def trim_internal(x, axes, boundary=None):
    """ Trim sides from each block

    This couples well with the overlap operation, which may leave excess data on
    each block

    See also
    --------
    dask.array.chunk.trim
    dask.array.map_blocks
    """
    boundary = coerce_boundary(x.ndim, boundary)

    olist = []
    for i, bd in enumerate(x.chunks):
        bdy = boundary.get(i, 'none')
        ilist = []
        for j, d in enumerate(bd):
            if bdy != 'none':
                d = d - axes.get(i, 0) * 2
            else:
                d = d - axes.get(i, 0) if j != 0 else d
                d = d - axes.get(i, 0) if j != len(bd) - 1 else d
            ilist.append(d)
        olist.append(tuple(ilist))

    chunks = tuple(olist)

    return map_blocks(partial(_trim, axes=axes, boundary=boundary),
                      x, chunks=chunks, dtype=x.dtype)
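A sketch of how the trim pairs with the overlap step; the dask.array.overlap module path and the example shapes are assumptions, not taken from the snippet above.

# Sketch: share one cell along axis 0, then trim it back off every block.
import dask.array as da

x = da.ones((12, 12), chunks=(4, 4))
g = da.overlap.overlap(x, depth={0: 1}, boundary={0: "reflect"})   # blocks grow by 2 rows
t = trim_internal(g, axes={0: 1}, boundary={0: "reflect"})         # back to (4, 4) chunks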
Example #9
def test_inline_ignores_curries_and_partials():
    dsk = {'x': 1, 'y': 2,
           'a': (partial(add, 1), 'x'),
           'b': (inc, 'a')}

    result = inline_functions(dsk, fast_functions=set([add]))
    assert 'a' not in set(result.keys())
Example #10
def ghost_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes dict informs how many cells to overlap between neighboring blocks
    {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis
    """
    dims = list(map(len, x.blockdims))
    expand_key2 = partial(expand_key, dims=dims)
    interior_keys = pipe(x._keys(), flatten,
                                    map(expand_key2), map(flatten),
                                    concat, list)
    interior_slices = dict((k, fractional_slice(k, axes))
                            for k in interior_keys)

    shape = (3,) * x.ndim
    name = next(ghost_names)
    ghost_blocks = dict(((name,) + k[1:],
                         (rec_concatenate, (concrete, expand_key2(k))))
                        for k in interior_keys)

    blockdims = [  [bds[0] + axes.get(i, 0)]
                 + [bd + axes.get(i, 0) * 2 for bd in bds[1:-1]]
                 + [bds[-1] + axes.get(i, 0)]
                 for i, bds in enumerate(x.blockdims)]

    return Array(merge(interior_slices, ghost_blocks, x.dask),
                 name, blockdims=blockdims)
def is_date_a_fixed_closing_date(d, fixed_closing_days=[]):
    """
    Check if the given date is in the given list of dates; the year is
    ignored in the comparison.
    """
    return d in filter(None, map(partial(date_with_year, d.year),
                                 fixed_closing_days))
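A usage sketch; date_with_year is assumed to rebuild a date with the given year (and to return None when that is impossible, e.g. Feb 29).

# Hypothetical usage sketch: Christmas is closed every year, whatever year the stored dates carry.
from datetime import date

closed = [date(2000, 12, 25), date(2000, 1, 1)]
is_date_a_fixed_closing_date(date(2016, 12, 25), fixed_closing_days=closed)  # True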
Example #12
def format_results(terminal_width, key_list, separator, text_list,
                   left_align=True, min_factor=3, **kwargs):
    """Returns formatted results in two columns.
    """
    key_width = max(map(len, key_list))
    separator_length = len(separator)
    desc_wrap = toolz.identity
    if terminal_width:
        if key_width / terminal_width > .5:
            key_width = terminal_width // 2 - 3
        text_width = terminal_width - key_width - separator_length
        if text_width * min_factor > terminal_width:
            desc_wrap = toolz.compose(
                ('\n' + ' ' * (key_width + separator_length)).join,
                toolz.partial(textwrap.wrap, width=text_width, **kwargs),
            )

    if left_align:
        fmt = '%-*s%s%s'
    else:
        fmt = '%*s%s%s'

    for key, text in zip(key_list, text_list):
        text = desc_wrap(text)
        if len(key) > key_width:
            yield fmt % (key_width, key, separator, '')
            yield fmt % (key_width, '', ' ' * separator_length, text)
        else:
            yield fmt % (key_width, key, separator, text)
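A usage sketch rendering a small two-column option listing at a fixed terminal width.

# Hypothetical usage sketch: print aligned key/description pairs 60 columns wide.
for line in format_results(60, ['--verbose', '--out'], ' : ',
                           ['Print more information.', 'Path of the output file.']):
    print(line)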
Example #13
 def __getattr__(self, key):
     if key in dir(self._accessor):
         if isinstance(getattr(self._accessor, key), property):
             return self._property_map(key)
         else:
             return partial(self._function_map, key)
     else:
         raise AttributeError(key)
Example #14
def hash_join(lhs, left_on, rhs, right_on, how="inner", npartitions=None, suffixes=("_x", "_y")):
    """ Join two DataFrames on particular columns with hash join

    This shuffles both datasets on the joined column and then performs an
    embarrassingly parallel join partition-by-partition

    >>> hash_join(a, 'id', rhs, 'id', how='left', npartitions=10)  # doctest: +SKIP
    """
    if npartitions is None:
        npartitions = max(lhs.npartitions, rhs.npartitions)

    lhs2 = shuffle(lhs, left_on, npartitions)
    rhs2 = shuffle(rhs, right_on, npartitions)

    if isinstance(left_on, Index):
        left_on = None
        left_index = True
    else:
        left_index = False

    if isinstance(right_on, Index):
        right_on = None
        right_index = True
    else:
        right_index = False

    # dummy result
    dummy = pd.merge(
        lhs._pd,
        rhs._pd,
        how,
        None,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        suffixes=suffixes,
    )

    merger = partial(
        _pdmerge, suffixes=suffixes, default_left_columns=list(lhs.columns), default_right_columns=list(rhs.columns)
    )

    if isinstance(left_on, list):
        left_on = (list, tuple(left_on))
    if isinstance(right_on, list):
        right_on = (list, tuple(right_on))

    token = tokenize(lhs, left_on, rhs, right_on, left_index, right_index, how, npartitions, suffixes)
    name = "hash-join-" + token

    dsk = dict(
        ((name, i), (merger, (lhs2._name, i), (rhs2._name, i), how, left_on, right_on, left_index, right_index))
        for i in range(npartitions)
    )

    divisions = [None] * (npartitions + 1)
    return DataFrame(toolz.merge(lhs2.dask, rhs2.dask, dsk), name, dummy, divisions)
Example #15
def test_get_with_dill():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        dsk = {'x': 1, 'y': (partial(add, 1), 'x')}
        keys = 'y'

        assert c.get(dsk, keys) == 2
        c.close()
Example #16
def hash_join(lhs, left_on, rhs, right_on, how='inner',
              npartitions=None, suffixes=('_x', '_y'), shuffle=None, indicator=False):
    """ Join two DataFrames on particular columns with hash join

    This shuffles both datasets on the joined column and then performs an
    embarrassingly parallel join partition-by-partition

    >>> hash_join(a, 'id', rhs, 'id', how='left', npartitions=10)  # doctest: +SKIP
    """
    print('started hash_join, indicator =', indicator)
    if npartitions is None:
        npartitions = max(lhs.npartitions, rhs.npartitions)

    lhs2 = shuffle_func(lhs, left_on, npartitions=npartitions, shuffle=shuffle)
    rhs2 = shuffle_func(rhs, right_on, npartitions=npartitions, shuffle=shuffle)

    if isinstance(left_on, Index):
        left_on = None
        left_index = True
    else:
        left_index = False

    if isinstance(right_on, Index):
        right_on = None
        right_index = True
    else:
        right_index = False

    # dummy result
    meta = pd.merge(lhs._meta_nonempty, rhs._meta_nonempty, how, None,
                    left_on=left_on, right_on=right_on,
                    left_index=left_index, right_index=right_index,
                    suffixes=suffixes, indicator=indicator)

    merger = partial(_pdmerge, suffixes=suffixes,
                     default_left_columns=list(lhs.columns),
                     default_right_columns=list(rhs.columns), indicator=indicator)

    if isinstance(left_on, list):
        left_on = (list, tuple(left_on))
    if isinstance(right_on, list):
        right_on = (list, tuple(right_on))

    token = tokenize(lhs2, left_on, rhs2, right_on, left_index, right_index,
                     how, npartitions, suffixes, shuffle)
    name = 'hash-join-' + token

    dsk = dict(((name, i), (merger, (lhs2._name, i), (rhs2._name, i),
                            how, left_on, right_on,
                            left_index, right_index))
                for i in range(npartitions))

    divisions = [None] * (npartitions + 1)
    return DataFrame(toolz.merge(lhs2.dask, rhs2.dask, dsk),
                     name, meta, divisions)
Example #17
 def __getattr__(self, key):
     try:
         return object.__getattribute__(self, key)
     except AttributeError:
         if key in dir(pd.Series.str):
             if isinstance(getattr(pd.Series.str, key), property):
                 return self._property_map(key)
             else:
                 return partial(self._function_map, key)
         else:
             raise
Example #18
File: olt.py Project: sjava/olt
def zte_gpon_svlan_check():
    clear_log()
    nodes = graph.cypher.execute(
        "match(n:Olt)--(c:Card) where c.name='GTGO' return n.ip,collect(c.slot)")
    olts = ((x[0], x[1]) for x in nodes)
    lzte_gpon_svlan = lambda x: zte_gpon_svlan(ip=x[0], slots=x[1])
    pool = Pool(8)
    lock = Manager().Lock()
    func = partial(svlan_entry, lock)
    list(pool.map(compose(func, lzte_gpon_svlan), olts))
    pool.close()
    pool.join()
Example #19
def test_normalize_function():
    def f1(a, b, c=1):
        pass
    def f2(a, b=1, c=2):
        pass
    def f3(a):
        pass

    assert normalize_function(f2)

    f = lambda a: a
    assert normalize_function(f)

    assert (normalize_function(partial(f2, b=2)) ==
            normalize_function(partial(f2, b=2)))

    assert (normalize_function(partial(f2, b=2)) !=
            normalize_function(partial(f2, b=3)))

    assert (normalize_function(partial(f1, b=2)) !=
            normalize_function(partial(f2, b=2)))

    assert (normalize_function(compose(f2, f3)) ==
            normalize_function(compose(f2, f3)))

    assert (normalize_function(compose(f2, f3)) !=
            normalize_function(compose(f2, f1)))

    assert normalize_function(curry(f2)) == normalize_function(curry(f2))
    assert normalize_function(curry(f2)) != normalize_function(curry(f1))
    assert (normalize_function(curry(f2, b=1)) ==
            normalize_function(curry(f2, b=1)))
    assert (normalize_function(curry(f2, b=1)) !=
            normalize_function(curry(f2, b=2)))
Example #20
File: olt.py Project: sjava/olt
def card_check():
    clear_log()
    #  nodes = graph.find('Olt', property_key='ip', property_value='218.92.130.130')
    nodes = graph.find('Olt')
    #  nodes = graph.find('Olt', property_key='company', property_value='zte')
    olts = [(x['ip'], x['company'], x['area']) for x in nodes]
    #  list(map(compose(card_entry, get_card), olts))
    pool = multiprocessing.Pool(8)
    lock = multiprocessing.Manager().Lock()
    func = partial(card_entry_m, lock)
    list(pool.map(compose(func, get_card), olts))
    pool.close()
    pool.join()
Example #21
def interface_check_m():
    clear_log()
    #  cmd = "match(s: Switch) where s.model in ['S8505','S8508'] return s.ip, s.model"
    cmd = "match(s: Switch)  return s.ip, s.model"
    #  cmd = "match(s:Switch) where s.model='S9306' or s.model='s9303' return s.ip,s.model limit 2"
    nodes = graph.cypher.execute(cmd)
    switchs = [(x[0], x[1]) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    out_inf = partial(output_interface_m, lock)
    list(pool.map(compose(out_inf, get_interface), switchs))
    pool.close()
    pool.join()
Example #22
File: olt.py Project: sjava/olt
def svlan_check():
    clear_log()
    #  nodes = graph.find('Olt', property_key='ip', property_value='9.192.96.246')
    nodes = graph.find('Olt')
    #  nodes = graph.find('Olt', property_key='company', property_value='zte')
    olts = [(x['ip'], x['company'], x['area']) for x in nodes]
    #  list(map(compose(card_entry, get_card), olts))
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(svlan_entry, lock)
    list(pool.map(compose(func, get_svlan), olts))
    pool.close()
    pool.join()
Example #23
def trim_internal(x, axes=None):
    """ Trim sides from each block

    This couples well with the ghost operation, which may leave excess data on
    each block

    See also
        chunk.trim
        map_blocks
    """
    blockdims = tuple([tuple([d - axes.get(i, 0)*2 for d in bd])
                       for i, bd in enumerate(x.blockdims)])
    return map_blocks(x, partial(chunk.trim, axes=axes), blockdims=blockdims)
Example #24
def run_features(args):
    """Run image feature computation.

    Parameters
    ----------
    args : argparse.Namespace
        The arguments parsed by the argparse library.
    """
    if args.global_threshold:
        images = map(io.imread, args.images)
        thresholds = pre.global_threshold(images, args.random_seed)
    else:
        thresholds = None
    images = map(io.imread, args.images)
    screen_info = screens.d[args.screen]
    index_function, fmap = screen_info['index'], screen_info['fmap']
    fmap = tz.partial(fmap, threshold=thresholds,
                            sample_size=args.sample_size,
                            random_seed=args.random_seed)
    indices = list(map(index_function, args.images))
    f0, feature_names = fmap(next(images))
    feature_vectors = tz.cons(f0, (fmap(im)[0] for im in images))
    online_scaler = StandardScaler()
    online_pca = cluster.OnlineIncrementalPCA(n_components=args.n_components,
                                              batch_size=args.pca_batch_size)
    nimages, nfeatures = len(args.images), len(f0)
    emit = io.emitter_function(args.emitter)
    with temporary_hdf5_dataset((nimages, nfeatures), 'float') as dset:
        # First pass: compute the features, compute the mean and SD,
        # compute the PCA
        for i, (idx, v) in enumerate(zip(indices, feature_vectors)):
            emit({'_id': idx, 'feature_vector': list(v)})
            dset[i] = v
            online_scaler.partial_fit(v.reshape(1, -1))
            online_pca.add_sample(v)
        # Second pass: standardise the feature vectors, compute PCA-transform
        for i, (idx, v) in enumerate(zip(indices, dset)):
            v_std = online_scaler.transform(v.reshape(1, -1))[0]
            v_pca = online_pca.transform(v)
            dset[i] = v_std
            emit({'_id': idx, 'feature_vector_std': list(v_std),
                              'pca_vector': list(v_pca)})
            online_pca.transform(v)
        # Third pass: Compute the nearest neighbors graph.
        # THIS ANNOYINGLY INSTANTIATES FULL ARRAY -- no out-of-core
        # solution that I'm aware of...
        ng = neighbors.kneighbors_graph(dset, args.num_neighbours,
                                        include_self=False, mode='distance')
        for idx, row in zip(indices, ng):
            emit({'_id': idx, 'neighbours': [indices[i] for i in row.indices]})
Example #25
	def __init__(self, bamfile, outdir):
		self.bamfile = bamfile
		stat = self.indexbamfile()
		self.outdir = outdir
		assert self.bamfile and self.outdir and stat, "Input error"
		self._bam = pysam.Samfile(bamfile)
		self._prealloc_func = partial(np.zeros, dtype=np.int)
		self.fake_bed_rows = [("chrX", 1, 59373566), ("chrY", 69362, 11375310)]
		self.sequence = pipe(self.fake_bed_rows,
		                     map(lambda interval: self.depthreader(*interval)),
		                     map(average)
		                     )
		self.x_coverage, self.y_coverage = list(self.sequence)
		self.sex = self.predict_gender()
Example #26
def overlap_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input informs how many cells to overlap between neighboring blocks
    {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis
    """
    dims = list(map(len, x.chunks))
    expand_key2 = partial(expand_key, dims=dims, axes=axes)

    # Make keys for each of the surrounding sub-arrays
    interior_keys = pipe(x.__dask_keys__(), flatten, map(expand_key2),
                         map(flatten), concat, list)

    name = 'overlap-' + tokenize(x, axes)
    getitem_name = 'getitem-' + tokenize(x, axes)
    interior_slices = {}
    overlap_blocks = {}
    for k in interior_keys:
        frac_slice = fractional_slice((x.name,) + k, axes)
        if (x.name,) + k != frac_slice:
            interior_slices[(getitem_name,) + k] = frac_slice
        else:
            interior_slices[(getitem_name,) + k] = (x.name,) + k
            overlap_blocks[(name,) + k] = (concatenate3,
                                           (concrete, expand_key2((None,) + k, name=getitem_name)))

    chunks = []
    for i, bds in enumerate(x.chunks):
        if len(bds) == 1:
            chunks.append(bds)
        else:
            left = [bds[0] + axes.get(i, 0)]
            right = [bds[-1] + axes.get(i, 0)]
            mid = []
            for bd in bds[1:-1]:
                mid.append(bd + axes.get(i, 0) * 2)
            chunks.append(left + mid + right)

    dsk = merge(interior_slices, overlap_blocks)
    dsk = sharedict.merge(x.dask, (name, dsk))

    return Array(dsk, name, chunks, dtype=x.dtype)
Example #27
File: olt.py Project: sjava/olt
def hostname_check():
    clear_log()
    nodes = graph.find('Olt')
    #  nodes = graph.find('Olt', property_key='ip', property_value='172.18.0.46')
    olts = [(x['ip'], x['company']) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(hostname_entry, lock)
    list(pool.map(compose(func, get_hostname), olts))
    pool.close()
    pool.join()
    ip_hostname = (x.split(',') for x in open(result_file))
    cmd = "match (n:Olt) where n.ip={ip} set n.hostname={hostname}"
    list(map(lambda x: graph.cypher.execute(
        cmd, ip=x[0], hostname=x[1]), ip_hostname))
def get(dsk, keys, optimizations=[], num_workers=None,
        func_loads=None, func_dumps=None, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------

    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                          func_dumps=func_dumps,
                                          func_loads=func_loads)

    # Optimize Dask
    dsk2 = fuse(dsk, keys)
    dsk3 = pipe(dsk2, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, len(pool._pool), dsk3, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
Example #29
def test_normalize_function():
    def f1(a, b, c=1):
        pass
    cf1 = curry(f1)
    def f2(a, b=1, c=2):
        pass
    def f3(a):
        pass
    assert normalize_function(f2) == str(f2)
    f = lambda a: a
    assert normalize_function(f) == str(f)
    comp = compose(partial(f2, b=2), f3)
    assert normalize_function(comp) == ((str(f2), (), (('b', 2),)), str(f3))
    assert normalize_function(cf1) == (str(f1), (), ())
    assert normalize_function(cf1(2, c=2)) == (str(f1), (2,), (('c', 2),))
    assert normalize_token(cf1) == normalize_function(cf1)
Example #30
def predict(model, x):
    """ Predict with a scikit learn model

    Parameters
    ----------

    model: scikit learn classifier
    x: dask Array

    See docstring for ``da.learn.fit``
    """
    assert x.ndim == 2
    if len(x.chunks[1]) > 1:
        x = x.reblock(chunks=(x.chunks[0], sum(x.chunks[1])))
    func = partial(_predict, model)
    return x.map_blocks(func, chunks=(x.chunks[0], (1,))).squeeze()
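A usage sketch, assuming a fitted scikit-learn estimator and the dask array API of this snippet's era (with the _predict helper available in the same module).

# Hypothetical usage sketch: blocked prediction over a tall-and-skinny dask array.
import numpy as np
import dask.array as da
from sklearn.linear_model import LogisticRegression

model = LogisticRegression().fit(np.random.rand(100, 4), np.random.randint(0, 2, 100))
x = da.from_array(np.random.rand(1000, 4), chunks=(100, 4))
y = predict(model, x)  # lazy array of predictions, one block per row-chunk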
Example #31
import pytest
from toolz import partial

import dask
from dask import compute
from dask.compatibility import PY_VERSION
from dask.utils import filetexts
from dask.bytes import utils
from dask.bag.text import read_text
from fsspec.compression import compr

compute = partial(compute, scheduler="sync")


files = {
    ".test.accounts.1.json": (
        '{"amount": 100, "name": "Alice"}\n'
        '{"amount": 200, "name": "Bob"}\n'
        '{"amount": 300, "name": "Charlie"}\n'
        '{"amount": 400, "name": "Dennis"}\n'
    ),
    ".test.accounts.2.json": (
        '{"amount": 500, "name": "Alice"}\n'
        '{"amount": 600, "name": "Bob"}\n'
        '{"amount": 700, "name": "Charlie"}\n'
        '{"amount": 800, "name": "Dennis"}\n'
    ),
}


expected = "".join([files[v] for v in sorted(files)])
Example #32
def get_pet_relations(pet):
    return compose(list,
                   partial(pluck,
                           "customer"))(frappe.get_all("Pet Relation",
                                                       filters={"parent": pet},
                                                       fields=["customer"]))
Example #33
def extract(
    image,
    classifier=None,
    context=default_context,
    output_folder=None,
    return_negatives=False,
    override_prediction=False,
):
    """
    extract(image, classifier, context=default_context, output_folder=None, return_negatives=False, override_prediction=False)
    A function that utilises the core of the package to extract the required
    lines and, by default, classifies them into required and non-required lines.

    Note
    ----
    Needs refactoring.

    Parameters
    ----------
    image : np.array or str
        image as loaded by ``cv2.imread`` or string path to the image on disk

    classifier : sklearn model or str
        sklearn model for classification, or a string path to a pickled model,
        in which case that trained model is loaded.
        The current default model is used if nothing else is provided.

    context : dict
        parameter dictionary which contains default settings
        for various functions
        # TODO: Write better summary of how to use this

    output_folder : str
        if provided will save the predicted lines

    override_prediction : bool
        if ``True``, any filtering done by the model is skipped and this turns
        into a regular pipeline that returns just the subsets

    expand : dict
        experimental feature. This will eventually accept a dictionary of parameters
        which will be trickled down into the core, making testing easier. At the
        moment only the vertical padding of the system can be changed

    Returns
    -------
    list | tuple
        a list of cutout lines in numpy array form if ``return_negatives`` is disabled,
        else a tuple containing both positive predictions and negatives (1s and 0s)

    """
    # logic for classifier assessment
    if classifier:
        if isinstance(classifier, str):
            classifier = io.load_model(classifier)
        else:
            classifier = classifier
    else:
        classifier = io.load_model(
            resource_filename("readpyne", "models/classifier.pkl"))

    pipe = fp.compose(unfold_args(core.features),
                      fp.partial(core.boxes, context=context))

    if isinstance(image, str):
        pipe = fp.compose(pipe, io.load_validate)

    subsets, features = pipe(image)

    # return the subsets raw without doing any other work
    if override_prediction:
        print(
            "[WARN] You have chosen not to use the classifier and hence full list of lines is returned"
        )
        return subsets

    # Use the model to predict
    prediction = classifier.predict(features)

    # get the zero and non-zero indices
    bindices_zero = prediction == 0
    zeros = np.arange(len(prediction))[bindices_zero]
    nonzeros = np.arange(len(prediction))[~bindices_zero]

    # Try to get the subsets that classify as non-zero
    try:
        positives = itemgetter(*nonzeros)(subsets.copy())
    except:
        raise NoPositivesFound("Could not get positive (1's) from subsets")

    # Make sure in the case of only 1 line found, we still return a list and
    # not an array.
    if not isinstance(positives, tuple) and isinstance(positives,
                                                       type(np.zeros(1))):
        positives = (positives, )

    print(f"[INFO] {len(positives)} item lines found by the classifier")

    # output positives if this is provided
    if output_folder:
        io.save_images(positives, path=output_folder)

    # if required return negatives as a tuple
    if return_negatives:
        try:
            negatives = itemgetter(*zeros)(subsets.copy())

        except:
            raise Exception("Could not get 0's from subsets")

        if not isinstance(negatives, tuple) and isinstance(
                negatives, type(np.zeros(1))):
            negatives = (negatives, )

        print(
            f"[INFO] {len(negatives)} non-item lines found by the classifier")
        # override positives to contain the final results
        positives = (positives, negatives)

    return positives
Example #34
def cxonepointleafbiased(**kwargs):
    """Factory for cxonepointleafbiased"""
    termpb = kwargs.get("termpb", 0.1)
    return toolz.partial(deap.gp.cxOnePointLeafBiased, termpb=termpb)
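A usage sketch; the factory simply curries the DEAP operator, so the returned callable can be registered or called directly.

# Hypothetical usage sketch: bake termpb into the DEAP crossover operator.
mate = cxonepointleafbiased(termpb=0.2)
# mate(ind1, ind2) now behaves like deap.gp.cxOnePointLeafBiased(ind1, ind2, termpb=0.2)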
Example #35
def mutnodereplacement(pset, **kwargs):
    """Factory for mutnodereplacement"""
    return toolz.partial(deap.gp.mutNodeReplacement, pset=pset)
Example #36
with ignoring(ImportError):
    import lz4

    def _fixed_lz4_decompress(data):
        # lz4.LZ4_uncompress() doesn't accept memoryviews
        if isinstance(data, memoryview):
            data = data.tobytes()
        return lz4.LZ4_uncompress(data)

    compressions['lz4'] = {'compress': lz4.LZ4_compress,
                           'decompress': _fixed_lz4_decompress}
    default_compression = 'lz4'

with ignoring(ImportError):
    import blosc
    compressions['blosc'] = {'compress': partial(blosc.compress, clevel=5,
                                                 cname='lz4'),
                             'decompress': blosc.decompress}


default = config.get('compression', 'auto')
if default != 'auto':
    if default in compressions:
        default_compression = default
    else:
        raise ValueError("Default compression '%s' not found.\n"
                "Choices include auto, %s" % (
                    default, ', '.join(sorted(map(str, compressions)))))


def byte_sample(b, size, n):
    """ Sample a bytestring from many locations
Example #37
def partial_serializer(serializer_name, dump_kwargs, load_kwargs):
    s = serializers[serializer_name]
    return Serializer(
        s.name,
        t.partial(s.dump, **dump_kwargs) if dump_kwargs else s.dump,
        t.partial(s.load, **load_kwargs) if load_kwargs else s.load)
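A usage sketch; the serializers registry key and the Serializer dump/load signatures are assumptions read off the snippet above.

# Hypothetical usage sketch: bind pretty-printing options onto a registered 'json' serializer.
pretty_json = partial_serializer('json', dump_kwargs={'indent': 2}, load_kwargs=None)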
Example #38
def resource_json_gzip(uri):
    return resource_json(uri, open=partial(gzip.open, mode='rt'))
Example #39
        except (ValueError, TypeError):
            if isinstance(data, memoryview):
                return lz4_decompress(data.tobytes())
            else:
                raise

    compressions['lz4'] = {
        'compress': _fixed_lz4_compress,
        'decompress': _fixed_lz4_decompress
    }
    default_compression = 'lz4'

with ignoring(ImportError):
    import blosc
    compressions['blosc'] = {
        'compress': partial(blosc.compress, clevel=5, cname='lz4'),
        'decompress': blosc.decompress
    }

default = config.get('compression', 'auto')
if default != 'auto':
    if default in compressions:
        default_compression = default
    else:
        raise ValueError("Default compression '%s' not found.\n"
                         "Choices include auto, %s" %
                         (default, ', '.join(sorted(map(str, compressions)))))


def byte_sample(b, size, n):
    """ Sample a bytestring from many locations
Example #40
    def delete(self, id):
        return cs.chained_delete(self, id)

    def _filename(self, id):
        return cs.chained_filename(self, id)


### ArtifactSet logic


def _set_op(operator, *sets, labels=None):
    new_ids = t.reduce(operator, t.map(lambda s: s.artifact_ids, sets))
    return ArtifactSet(new_ids, labels)


set_union = t.partial(_set_op, ops.or_)
set_difference = t.partial(_set_op, ops.sub)
set_intersection = t.partial(_set_op, ops.and_)

artifact_set_properties = ['id', 'artifact_ids', 'created_at', 'labels']


class ArtifactSet(namedtuple('ArtifactSet', artifact_set_properties)):
    def __new__(cls, artifact_ids, labels=None, created_at=None, id=None):
        artifact_ids = t.map(_artifact_id, artifact_ids)
        labels = _check_labels_name(labels)
        ids = frozenset(artifact_ids)
        if id:
            set_id = id
        else:
            set_id = hash(ids)
Example #41
def test_inline_ignores_curries_and_partials():
    dsk = {'x': 1, 'y': 2, 'a': (partial(add, 1), 'x'), 'b': (inc, 'a')}

    result = inline_functions(dsk, fast_functions=set([add]))
    assert 'a' not in set(result.keys())
Example #42
    """
    img = resize(img)
    east_decode = unfold_args(fp.partial(decode, **context["boxes"]))
    arr = fp.compose(east_decode, forward, blobify)(img)

    rects, conf = arr[:, 1:], arr[:, 0]
    boxes = non_max_suppression(expand(rects, img.shape, **context["expand"]),
                                probs=conf)

    # preserve order by sorting on startx
    sorted_boxes = pd.DataFrame(boxes).sort_values(1).values

    return img, get_subsets(img, sorted_boxes)


boxesM = fp.partial(map, boxes)


def features(img, subsets):
    """
    Take an image and its subsets created from ``boxes`` and 
    produce histogram based features for each subset. 

    Parameters
    ----------
    img : numpy.array
        numpy array representation of an image.

    subsets : list
        list of numpy arrays of the subsets.
Example #43
from io import BytesIO

import pytest

pd = pytest.importorskip('pandas')
dd = pytest.importorskip('dask.dataframe')

from toolz import partition_all, valmap, partial

from dask import compute
from dask.async import get_sync
from dask.dataframe.csv import read_csv_from_bytes, bytes_read_csv, read_csv
from dask.dataframe.utils import eq
from dask.utils import filetexts, filetext

compute = partial(compute, get=get_sync)

files = {
    '2014-01-01.csv': (b'name,amount,id\n'
                       b'Alice,100,1\n'
                       b'Bob,200,2\n'
                       b'Charlie,300,3\n'),
    '2014-01-02.csv': (b'name,amount,id\n'),
    '2014-01-03.csv': (b'name,amount,id\n'
                       b'Dennis,400,4\n'
                       b'Edith,500,5\n'
                       b'Frank,600,6\n')
}

header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'
Example #44
import os
from time import sleep
import sys

import pytest
from toolz import concat, valmap, partial

from dask import compute
from dask.compatibility import FileNotFoundError, unicode
from dask.utils import filetexts
from dask.bytes import compression
from dask.bytes.local import LocalFileSystem
from dask.bytes.core import (read_bytes, open_files, get_pyarrow_filesystem,
                             logical_size, get_fs_token_paths)

compute = partial(compute, scheduler='sync')

files = {'.test.accounts.1.json': (b'{"amount": 100, "name": "Alice"}\n'
                                   b'{"amount": 200, "name": "Bob"}\n'
                                   b'{"amount": 300, "name": "Charlie"}\n'
                                   b'{"amount": 400, "name": "Dennis"}\n'),
         '.test.accounts.2.json': (b'{"amount": 500, "name": "Alice"}\n'
                                   b'{"amount": 600, "name": "Bob"}\n'
                                   b'{"amount": 700, "name": "Charlie"}\n'
                                   b'{"amount": 800, "name": "Dennis"}\n')}


csv_files = {'.test.fakedata.1.csv': (b'a,b\n'
                                      b'1,2\n'),
             '.test.fakedata.2.csv': (b'a,b\n'
                                      b'3,4\n'),
Example #45
    if not isoption:
        # a is not an option, this is just a
        return a

    b_dshape = discover(b)
    return Coalesce(
        a, b,
        DataShape(*(maxshape((a_dshape.shape, b_dshape.shape)) +
                    (promote(a_measure, b_dshape.measure), ))))


dshape_method_list = list()
schema_method_list = list()
method_properties = set()

dshape_methods = memoize(partial(select_functions, dshape_method_list))
schema_methods = memoize(partial(select_functions, schema_method_list))


@dispatch(DataShape)
def shape(ds):
    s = ds.shape
    s = tuple(int(d) if isinstance(d, Fixed) else d for d in s)
    return s


@dispatch(object)
def shape(expr):
    """ Shape of expression

    >>> symbol('s', '3 * 5 * int32').shape
Example #46
def get_data(sackmann_dir, tour='atp', keep_davis_cup=False):

    all_csvs = glob(join(sackmann_dir, f'*{tour}_matches_????.csv'))
    all_csvs = sorted(all_csvs, key=lambda x: int(splitext(x)[0][-4:]))

    levels_to_drop = ['C', 'S']

    if not keep_davis_cup:
        levels_to_drop.append('D')

    data = pipe(
        all_csvs,
        # Read CSV
        lambda y: map(partial(pd.read_csv, encoding="ISO-8859-1"), y),
        # Drop NAs in important fields
        lambda y: map(
            lambda x: x.dropna(subset=['winner_name', 'loser_name', 'score']),
            y),
        # Drop retirements and walkovers
        # TODO: Make this optional
        lambda y: map(
            lambda x: x[~x['score'].astype(str).str.contains(
                'RET|W/O|DEF|nbsp|Def.')], y),
        # Drop scores that appear truncated
        lambda y: map(lambda x: x[x['score'].astype(str).str.len() > 4], y),
        # Drop challengers and futures
        # TODO: Make this optional too
        lambda y: map(lambda x: x[~x['tourney_level'].isin(levels_to_drop)], y
                      ),
        pd.concat,
    )

    round_numbers = {
        'R128': 1,
        'RR': 1,
        'R64': 2,
        'R32': 3,
        'R16': 4,
        'QF': 5,
        'SF': 6,
        'F': 7
    }

    # Drop rounds outside this list
    to_keep = data['round'].isin(round_numbers)
    data = data[to_keep]

    # Add a numerical round number
    data['round_number'] = data['round'].replace(round_numbers)

    # Add date information
    data['tourney_date'] = pd.to_datetime(
        data['tourney_date'].astype(int).astype(str), format='%Y%m%d')
    data['year'] = data['tourney_date'].dt.year

    # Sort by date and round and reset index
    data = data.sort_values(['tourney_date', 'round_number'])
    data = data.reset_index(drop=True)

    data['pts_won_serve_winner'] = data['w_1stWon'] + data['w_2ndWon']
    data['pts_won_serve_loser'] = data['l_1stWon'] + data['l_2ndWon']

    data['pts_played_serve_winner'] = data['w_svpt']
    data['pts_played_serve_loser'] = data['l_svpt']

    # Add serve % won
    data['spw_winner'] = (data['w_1stWon'] + data['w_2ndWon']) / data['w_svpt']
    data['spw_loser'] = (data['l_1stWon'] + data['l_2ndWon']) / data['l_svpt']

    return data
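A usage sketch; the directory layout follows Jeff Sackmann's tennis_atp CSV dumps and the path is a placeholder.

# Hypothetical usage sketch: load ATP main-draw matches and inspect serve-points-won rates.
data = get_data('data/tennis_atp', tour='atp', keep_davis_cup=False)
print(data[['winner_name', 'loser_name', 'spw_winner', 'spw_loser']].head())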
Example #47
from __future__ import print_function, division, absolute_import

import pytest
from toolz import partial

from dask import compute, get
from dask.utils import filetexts
from dask.bytes import compression
from dask.bag.text import read_text

compute = partial(compute, get=get)

files = {
    '.test.accounts.1.json': ('{"amount": 100, "name": "Alice"}\n'
                              '{"amount": 200, "name": "Bob"}\n'
                              '{"amount": 300, "name": "Charlie"}\n'
                              '{"amount": 400, "name": "Dennis"}\n'),
    '.test.accounts.2.json': ('{"amount": 500, "name": "Alice"}\n'
                              '{"amount": 600, "name": "Bob"}\n'
                              '{"amount": 700, "name": "Charlie"}\n'
                              '{"amount": 800, "name": "Dennis"}\n')
}

expected = ''.join([files[v] for v in sorted(files)])

fmt_bs = ([(fmt, None) for fmt in compression.files] +
          [(fmt, 10) for fmt in compression.seekable_files] +
          [(fmt, None) for fmt in compression.seekable_files])
encodings = ['ascii', 'utf-8']  # + ['utf-16', 'utf-16-le', 'utf-16-be']
fmt_bs_enc = [(fmt, bs, encoding) for fmt, bs in fmt_bs
              for encoding in encodings]
Example #48
            if isinstance(data, (memoryview, bytearray)):
                return lz4_decompress(bytes(data))
            else:
                raise

    compressions["lz4"] = {
        "compress": _fixed_lz4_compress,
        "decompress": _fixed_lz4_decompress,
    }
    default_compression = "lz4"

with ignoring(ImportError):
    import blosc

    compressions["blosc"] = {
        "compress": partial(blosc.compress, clevel=5, cname="lz4"),
        "decompress": blosc.decompress,
    }


default = dask.config.get("distributed.comm.compression")
if default != "auto":
    if default in compressions:
        default_compression = default
    else:
        raise ValueError(
            "Default compression '%s' not found.\n"
            "Choices include auto, %s"
            % (default, ", ".join(sorted(map(str, compressions))))
        )
Example #49
def mutinsert(pset, **kwargs):
    """Factory for mutinsert"""
    return toolz.partial(deap.gp.mutInsert, pset=pset)
Example #50
def nsBlockFilterInit(ns):
    nsSet(ns, "/blocks/filter/in", partial(nsGet(ns, "/usr/local/blocks/filter/in"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/inF", partial(nsGet(ns, "/usr/local/blocks/filter/inF"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/out", partial(nsGet(ns, "/usr/local/blocks/filter/out"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/outF", partial(nsGet(ns, "/usr/local/blocks/filter/outF"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/create", partial(nsGet(ns, "/usr/local/blocks/filter/task"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/server", partial(nsGet(ns, "/usr/local/blocks/filter/server"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/handler", partial(nsGet(ns, "/usr/local/blocks/filter/handler"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/reject", partial(nsGet(ns, "/usr/local/blocks/filter/reject"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/call", partial(nsGet(ns, "/usr/local/blocks/filter/call"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/empty", partial(nsGet(ns, "/usr/local/blocks/filter/empty"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/exists", partial(nsGet(ns, "/usr/local/blocks/filter/exists"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/configured", True)
    return True
Example #51
    def parents(self):
        # Note: the last created Script object appears to bork the older ones. Must keep making new Script objects!

        # Note: jedi appears to already do enough caching. It does not significantly
        # improve performance to cache the parents.

        #acceptable_name_types = (jedi.parser.tree.Name,
        #                         jedi.evaluate.representation.InstanceElement)

        if self.definition and self.definition.module_path:
            script = jedi.api.Script(
                source_path=self.definition.module_path,
                sys_path=self.definition._evaluator.sys_path,
                line=self.definition.line,
                column=self.definition.column)

            usages = catch_errors(
                tz.partial(jedi_alt.usages.usages_with_additional_modules,
                           script, self.usage_resolution_modules), [],
                'while finding usages of {}'.format(self.code_element.name))

        elif self.code_element.call_pos[0]:
            call_pos_script = jedi.api.Script(
                source_path=self.code_element.call_pos[0],
                sys_path=self.definition._evaluator.sys_path
                if self.definition else self.sys_path,
                line=self.code_element.call_pos[1][0],
                column=self.code_element.call_pos[1][1])

            usages = catch_errors(
                tz.partial(jedi_alt.usages.usages_with_additional_modules,
                           call_pos_script, self.usage_resolution_modules), [],
                'while finding usages of {}'.format(self.code_element.name))

        elif self.definition:
            script = create_import_script(
                self.definition._evaluator.sys_path
                if self.definition else self.sys_path, self.code_element.name)

            usages = [
                usage for usage in catch_errors(
                    tz.partial(jedi_alt.usages.usages_with_additional_modules,
                               script, self.usage_resolution_modules), [],
                    'while finding usages of {}'.format(
                        self.code_element.name)) if usage.module_name
            ]

        else:
            return ()

        _unfiltered_parents = []
        positions = set()

        for usage in usages:
            tree_name = usage._name.tree_name
            if tree_name:
                position = (usage.module_path, tree_name.start_pos,
                            tree_name.end_pos)
            else:
                position = (None, (None, None), (None, None))

            if position not in positions or position == (None, (None, None),
                                                         (None, None)):
                _usage_parent = parent_definition(usage)

                if _usage_parent.module_path:
                    JediCodeElementNode.usage_resolution_modules |= frozenset(
                        (_usage_parent._name.get_root_context(), ))

                usage_node = JediCodeElementNode.from_definition(
                    'parent', position, _usage_parent)

                # check if this usage is actually the definition of the
                # current node, and is therefore already covered by the
                # "- [sig]" node.
                if (usage_node.code_element.call_pos[0]
                        == self.code_element.path
                        and usage_node.code_element.call_pos[1]
                        == self.code_element.start_pos
                        and usage_node.code_element.type == 'module'):

                    logger.info(
                        'Usages: Skipped definition of {} at {}:{}.'.format(
                            self.code_element.name,
                            usage_node.code_element.name,
                            usage_node.code_element.call_pos[1][0]))
                    continue
                else:
                    _unfiltered_parents.append(usage_node)
            positions.add(position)

        _cleanup_signal_queue()

        return _unfiltered_parents
Example #52
def nsBlockFilterTask(ns, block_path, name, _handler=None, _reject=None, **kw):
    task_path = "/tasks/filter/{}".format(name)
    if name in nsDir(ns, task_path):
        return True
    nsMkdir(ns, task_path)
    nsSet(ns, "{}/id".format(task_path), str(uuid.uuid4()))
    nsSet(ns, "{}/args".format(task_path), ())
    nsSet(ns, "{}/kw".format(task_path), {})
    nsSet(ns, "{}/blocking".format(task_path), False)
    nsSet(ns, "{}/in_q".format(task_path), Queue())
    nsSet(ns, "{}/out_q".format(task_path), Queue())
    nsSet(ns, "{}/in".format(task_path), partial(nsGet(ns, "/blocks/filter/in"), task_path))
    nsSet(ns, "{}/inF".format(task_path), partial(nsGet(ns, "/blocks/filter/inF"), task_path))
    nsSet(ns, "{}/out".format(task_path), partial(nsGet(ns, "/blocks/filter/out"), task_path))
    nsSet(ns, "{}/outF".format(task_path), partial(nsGet(ns, "/blocks/filter/outF"), task_path))
    nsSet(ns, "{}/empty".format(task_path), partial(nsGet(ns, "/blocks/filter/empty"), task_path))
    nsSet(ns, "{}/server".format(task_path), partial(nsGet(ns, "/blocks/filter/server"), task_path))
    if _handler is None:
        nsSet(ns, "{}/handler".format(task_path), partial(nsGet(ns, "/blocks/filter/handler"), task_path))
    else:
        nsSet(ns, "{}/handler".format(task_path), partial(_handler, block_path, task_path))
    if _reject is None:
        nsSet(ns, "{}/reject".format(task_path), partial(nsGet(ns, "/blocks/filter/reject"), task_path))
    else:
        nsSet(ns, "{}/reject".format(task_path), partial(_reject, block_path, task_path))
    nsSet(ns, "{}/call".format(task_path), partial(nsGet(ns, "/blocks/filter/call"), task_path))
    for k in kw:
        nsSet(ns, "{}/{}".format(task_path, k), kw[k])
    nsDaemon(ns, "TASK:filter:{}".format(name), nsGet(ns, "{}/server".format(task_path)), _raw=True)
    return True
Example #53
def main(config_file, model_name, fit_hyperparams, folds, submission, cv):
    print('Config file: ' + config_file)
    print('Model: ' + model_name)
    print('Fit hyperparams? ' + str(fit_hyperparams))
    print('Folds for which predictions will be added: ' + str(folds))
    print('Generate submission file? ' + str(submission))
    print('Cross-validate? ' + str(cv))

    with open(config_file, 'r') as f:
        config = yaml.load(f)
    with open(config['hyperparams_file'], 'r') as f:
        hyperparams = yaml.load(f)
        
    # Load data.
    print('Loading data...')
    train_df = pd.read_pickle(config['train'])
    test_df = pd.read_pickle(config['test'])

    # The model names and their definitions.
    model_dict = {'test':TestClassifier,
                  'nn':NN, 
                  'nnBagged':toolz.partial(StratifiedBaggingClassifier,
                                           base_estimator=NN(**hyperparams['nn']['constructor']),
                                           fit_params=hyperparams['nn']['fit']),
                  'xgbBagged':toolz.partial(StratifiedBaggingClassifier,
                                            base_estimator=XGBClassifier(**hyperparams['xgb']['constructor']),
                                            fit_params=hyperparams['xgb']['fit']),
                  'lgbmBagged':toolz.partial(StratifiedBaggingClassifier,
                                             base_estimator=LGBMClassifier(**hyperparams['lgbm']['constructor']),
                                             fit_params=hyperparams['lgbm']['fit']),
                  'lgbm':LGBMClassifier,
                  'xgb':XGBClassifier,
                  'xgbHist':XGBoostWrapper,
                  'svm':toolz.partial(svm.SVC, probability=True),
                  'randomForest':toolz.partial(RandomForestClassifier),
                  'logisticRegression':toolz.partial(LogisticRegression, class_weight='balanced'),
                  'logisticRegressionBagged':toolz.partial(StratifiedBaggingClassifier,
                                            base_estimator=LogisticRegression(**hyperparams['logisticRegression']['constructor']),
                                            fit_params=hyperparams['logisticRegression']['fit'])}

    if fit_hyperparams:
        print('Finding hyperparameters...')
        # Construct distributions from tuning_hyperparams.
        param_dists = {}
        tuning_hyperparams = hyperparams[model_name]['tuning_hyperparams']
        constructor_hyperparams = hyperparams[model_name]['constructor']
        nontuning_hyperparams = {x:constructor_hyperparams[x] for x in constructor_hyperparams if x not in tuning_hyperparams}
        for param in tuning_hyperparams:
            vals = tuning_hyperparams[param]['vals']
            param_type = tuning_hyperparams[param]['type']
            if param_type == 'int':
                lo, hi = np.min(vals), np.max(vals)
                # scipy's randint samples from [low, high), so add 1 to include the max.
                param_dists[param] = randint(lo, hi + 1)
            elif param_type == 'float':
                lo, hi = np.min(vals), np.max(vals)
                param_dists[param] = uniform(loc=lo, scale=(hi - lo))
            elif param_type == 'string':
                param_dists[param] = vals
            else:
                raise ValueError("Unexpected tuning parameter type: " + str(param_type))
        clf = RandomizedSearchCV(model_dict[model_name](**nontuning_hyperparams),
                                 param_distributions=param_dists,
                                 n_iter=config['tuning']['n_iter'],
                                 n_jobs=config['tuning']['n_jobs'],
                                 cv=config['tuning']['n_splits'],
                                 scoring='roc_auc', verbose=5)
        X = train_df.drop(['target', 'fold'], axis=1)
        y = train_df.loc[:, 'target']
        clf.fit(X=X, y=y, **hyperparams[model_name]['fit'])
        print('Found best hyperparams:')
        print(clf.best_params_)
        print('With AUC score:')
        print(clf.best_score_)

        # Put the grid-search best params into the hyperparams dict.
        # Numeric values may be numpy scalars; dumping those to YAML as-is
        # produces unreadable output, so convert them to plain Python numbers
        # first where necessary.
        for param, value in clf.best_params_.items():
            try:
                sanitised_value = value.item()  # Extract the number from the numpy scalar.
            except AttributeError:  # Was a plain Python number already.
                sanitised_value = value
            hyperparams[model_name]['constructor'][param] = sanitised_value
        # Save hyperparams.
        with open(config['hyperparams_file'], 'w') as f:
            yaml.dump(hyperparams, f, default_flow_style=False, indent=2)
        print('Wrote best params to ' + str(config['hyperparams_file']))

    if cv: # Cross-validate model to estimate accuracy.
        # Define model.
        print('Define model...')
        model = model_dict[model_name](**hyperparams[model_name]['constructor'])
        X = train_df.drop(['target', 'fold'], axis=1)
        y = train_df.loc[:, 'target']
        n_splits = 3
        fit_params = hyperparams[model_name]['fit']
        print("Estimating scores using cross-validation...")
        scores = cross_val_score(estimator=model, X=X, y=y, cv=n_splits, verbose=5,
                                 fit_params=fit_params, scoring=gini_scoring_fn, n_jobs=1)
        # Report error.
        print('Gini score mean (standard deviation): ' + str(np.mean(scores)) +
              ' (' + str(np.std(scores)) + ')')

    if submission: # Train and produce submission file.
        # Define model.
        print('Define model...')
        model = model_dict[model_name](**hyperparams[model_name]['constructor'])
        print('Fitting...')
        model.fit(X=train_df.drop(['target', 'fold'], axis=1),
                  y=train_df.loc[:, 'target'])
        # Create submission file with predictions.
        print("Predicting...")
        submit_file = config['submit_prefix'] + '_' + model_name + '_' + datetime_for_filename() + '.csv'
        (test_df
         .assign(target=model.predict_proba(test_df.drop('id', axis=1))[:,1])
         .loc[:, ['id', 'target']]
         .to_csv(submit_file, float_format=float_format, index=False))
        print("Saved submit file to " + submit_file)
    elif folds is not None: # Train with folds, for stacking.
        # Check that folds are valid.
        bad_folds = [x for x in folds if x not in range(-1, config['n_folds'])]
        if bad_folds:
            raise ValueError("These specified folds do not exist: " + str(bad_folds))
        # Define model.
        print('Define model...')
        model = model_dict[model_name](**hyperparams[model_name]['constructor'])
        model_col_name = 'model_' + model_name
        for fold in folds: 
            print("Fitting for fold " + str(fold) + "...")
            if fold != -1: # Fit for a specific fold.
                print('Fitting...')
                train_columns = list(set(train_df.columns) - set(['fold', 'target']) - set([x for x in train_df.columns if x.startswith('model_')]))
                model.fit(X=train_df.loc[train_df['fold'] != fold, train_columns],
                          y=train_df.loc[train_df['fold'] != fold, 'target'],
                          **hyperparams[model_name]['fit'])
                # Add predictions for fold.
                print("Predicting...")
                train_df.loc[train_df['fold'] == fold, model_col_name] = model.predict_proba(train_df.loc[train_df['fold'] == fold, train_columns])[:,1]
                train_df.to_pickle(config['train'])
                print('Added predictions for model ' + model_name + ', fold ' + str(fold) + ' to column ' + model_col_name + ' of ' +  config['train'])
            else: # Ignore folds and fit all data.
                print('Fitting...')
                columns_to_drop = ['target', 'fold'] + [x for x in train_df.columns if x.startswith('model_')]
                model.fit(X=train_df.drop(columns_to_drop, axis=1),
                          y=train_df.loc[:, 'target'])
                # Add predictions for whole test set to test CSV.
                print("Predicting...")
                test_file = config['test']
                test_columns_to_drop = ['id'] + [x for x in test_df.columns if x.startswith('model_')]
                (test_df
                 .assign(**{model_col_name: model.predict_proba(test_df.drop(test_columns_to_drop, axis=1))[:, 1]})
                 .to_pickle(test_file))
                print('Added predictions for model ' + model_name + ' to column ' + model_col_name + ' of ' + test_file)
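
# A minimal, hypothetical CLI wrapper for main() above (a sketch only; the
# original entry point is not shown in this example, and the argument names
# are taken directly from the function signature):
#
# import argparse
#
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('config_file')
#     parser.add_argument('model_name')
#     parser.add_argument('--fit-hyperparams', action='store_true')
#     parser.add_argument('--folds', type=int, nargs='*', default=None)
#     parser.add_argument('--submission', action='store_true')
#     parser.add_argument('--cv', action='store_true')
#     args = parser.parse_args()
#     main(args.config_file, args.model_name, args.fit_hyperparams,
#          args.folds, args.submission, args.cv)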
Beispiel #54
0
def test_sensitive_to_partials():
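    # delayed(..., pure=True) derives the key by tokenizing the callable, so the
    # two partials, which bind different arguments (10 vs. 20), get distinct keys.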
    assert (delayed(partial(add, 10), pure=True)(2)._key != delayed(
        partial(add, 20), pure=True)(2)._key)
Beispiel #55
0
import typing as t
from datetime import datetime
from operator import itemgetter

import snug
from toolz import flip, partial

from . import types

registry = snug.load.PrimitiveRegistry({
    datetime: partial(flip(datetime.strptime),  '%Y-%m-%dT%H:%M:%SZ'),
    **{
        c: c for c in [
            int,
            float,
            bool,
            str,
            types.Issue.State
        ]
    }
}) | snug.load.GenericRegistry({
    t.List: snug.load.list_loader
}) | snug.load.get_optional_loader | snug.load.AutoDataclassRegistry()
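
# A quick illustration (a sketch) of the datetime entry above: toolz.flip swaps
# the first two positional arguments, so partialling in the format string
# yields a one-argument timestamp parser. The sample value is hypothetical.
#
# parse_ts = partial(flip(datetime.strptime), '%Y-%m-%dT%H:%M:%SZ')
# parse_ts('2019-05-01T12:30:00Z')  # -> datetime(2019, 5, 1, 12, 30)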
Beispiel #56
0
# -*- coding: utf-8 -*-
"""
parse_helpers.py 

This module contains helper functions used in parsing scraper data. 
"""

from toolz import partial
import requests
from bs4 import BeautifulSoup
import re

is_e_type = lambda element_type, element: element.name == element_type
is_bold = partial(is_e_type, 'b')
is_anchor = partial(is_e_type, 'a')
is_strong = partial(is_e_type, 'strong')
b_or_strong = lambda e: is_strong(e) or is_bold(e)

a_cleanse = lambda txt: txt.replace(u'Â', u'').replace('\r\n', '\n')
get_content = lambda url: requests.get(url).content
get_soup = lambda url: BeautifulSoup(get_content(url))

starts_with_end_tag = re.compile(r'^(\s*<\s*/.*?>)+')
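
# Example (a sketch) of how the predicates above behave on a parsed fragment:
# soup = BeautifulSoup('<p><b>bold</b> and <strong>strong</strong></p>', 'html.parser')
# [b_or_strong(e) for e in soup.find_all(True)]  # -> [False, True, True]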

# TODO: refactor and document


def split(soup, splitter):
    """
    TODO: write this docstring 
    """
Beispiel #57
0
from __future__ import print_function, division, absolute_import

from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("dumbeddown")
sc = SparkContext(conf=conf)

from uuid import uuid4
from sabaody import Island, run_island, problem_constructor, getQualifiedName
from toolz import partial

run_id = str(uuid4())
num_islands = 4

island_ids = [str(uuid4()) for x in range(num_islands)]
islands = [Island(u, problem_constructor, partial(getQualifiedName, 'B2', str(run_id)), 'luna', 11211) for u in island_ids]
print(sc.parallelize(islands).map(run_island).collect())
Beispiel #58
0
def overlap_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input informs how many cells to overlap between neighboring blocks
    {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis
    """
    dims = list(map(len, x.chunks))
    expand_key2 = partial(expand_key, dims=dims, axes=axes)

    # Make keys for each of the surrounding sub-arrays
    interior_keys = pipe(
        x.__dask_keys__(), flatten, map(expand_key2), map(flatten), concat, list
    )

    name = "overlap-" + tokenize(x, axes)
    getitem_name = "getitem-" + tokenize(x, axes)
    interior_slices = {}
    overlap_blocks = {}
    for k in interior_keys:
        frac_slice = fractional_slice((x.name,) + k, axes)
        if (x.name,) + k != frac_slice:
            interior_slices[(getitem_name,) + k] = frac_slice
        else:
            interior_slices[(getitem_name,) + k] = (x.name,) + k
            overlap_blocks[(name,) + k] = (
                concatenate3,
                (concrete, expand_key2((None,) + k, name=getitem_name)),
            )

    chunks = []
    for i, bds in enumerate(x.chunks):
        depth = axes.get(i, 0)
        if isinstance(depth, tuple):
            left_depth = depth[0]
            right_depth = depth[1]
        else:
            left_depth = depth
            right_depth = depth

        if len(bds) == 1:
            chunks.append(bds)
        else:
            left = [bds[0] + right_depth]
            right = [bds[-1] + left_depth]
            mid = []
            for bd in bds[1:-1]:
                mid.append(bd + left_depth + right_depth)
            chunks.append(left + mid + right)

    dsk = merge(interior_slices, overlap_blocks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])

    return Array(graph, name, chunks, meta=x)
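
# A minimal usage sketch (assumes a dask.array environment providing this
# function's dependencies):
#
# import dask.array as da
# x = da.arange(12, chunks=4)       # chunks: (4, 4, 4)
# y = overlap_internal(x, {0: 1})   # share one cell with each neighbouring block
# y.chunks                          # -> ((5, 6, 5),): per the loop above, edge
#                                   #    blocks grow by 1 and interior blocks by 2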
Beispiel #59
0
                                    'result/olt_info.txt')

authenticate('61.155.48.36:7474', neo4j_username, neo4j_password)
graph = Graph("http://61.155.48.36:7474/db/data")


def clear_log():
    for f in [log_file, result_file]:
        if os.path.exists(f):
            os.remove(f)
        os.mknod(f)


######################card check################################
zte_card_check = partial(Zte.card_check,
                         username=zte_olt_username,
                         password=zte_olt_password)
hw_card_check = partial(Huawei.card_check,
                        username=hw_olt_username,
                        password=hw_olt_password)


def get_card(olt):
    functions = dict(zte=zte_card_check, hw=hw_card_check)
    no_company = lambda x: ['fail', None]
    ip, company = olt[:2]
    return functions.get(company, no_company)(ip) + [','.join(olt)]


def card_entry(info):
    create_card_node = lambda x: graph.create(
Beispiel #60
0
def get_data(filters):
    compute_data = partial(_compute_days, today())
    return compose(list,
                   partial(map, compute_data))(_get_inpatient_records(filters))
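
# Written out without compose (an equivalent sketch of the pipeline above):
#
# def get_data(filters):
#     records = _get_inpatient_records(filters)
#     return [_compute_days(today(), record) for record in records]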