def delete(ctx, cfg):
    '''Remove the stored predictions for the chip addressed by ctx.

    Reads 'cx' and 'cy' from ctx, builds a statement with
    db.delete_predictions and runs it through db.execute_statement.
    Returns the ctx it was given.
    '''

    cx = get('cx', ctx)
    cy = get('cy', ctx)
    db.execute_statement(cfg, db.delete_predictions(cfg, cx, cy))

    return ctx
def test_unload_segments():
    '''unload_segments yields no 'segments' value and keeps other keys.'''
    inputs = {'segments': 1, 'other': 2}

    outputs = segaux.unload_segments(inputs)

    assert get('segments', outputs, None) is None
    # Compare by value: `is` against an int literal tests object identity,
    # which only passes thanks to CPython's small-integer cache and emits
    # a SyntaxWarning on Python 3.8+.
    assert get('other', outputs) == 2
Exemple #3
0
def test_segment_runs_as_expected(client):
    '''
    As a blackmagic user, when I send cx, cy, & acquired range
    via HTTP POST, change segments are detected and saved
    so that they can be retrieved later.
    '''

    response = client.post('/segment',
                           json={
                               'cx': test.cx,
                               'cy': test.cy,
                               'acquired': test.acquired
                           })

    chips = _ceph.select_chip(cx=test.cx, cy=test.cy)

    pixels = _ceph.select_pixels(cx=test.cx, cy=test.cy)
    print("PIXEL LENGTH:{}".format(len(pixels)))
    print("PIXEL TYPE:{}".format(type(pixels)))

    segments = _ceph.select_segments(cx=test.cx, cy=test.cy)

    assert response.status == '200 OK'

    # Decode the response body once and reuse it for every field check.
    body = response.get_json()
    assert get('cx', body) == test.cx
    assert get('cy', body) == test.cy
    assert get('acquired', body) == test.acquired
    # None is a singleton: test identity, not equality.
    assert get('exception', body, None) is None

    # list() already consumes any iterable; no identity map() is needed.
    assert len(list(chips)) == 1
    assert len(list(pixels)) == 10000
    assert len(list(segments)) == 10000
 def tagRawSentence(self, rawLine, DICT, word_dict, pos_dict):
     """Tag rawLine and encode every token as a 3-tuple of lookups.

     The line is initialised with initializeSentence and split on
     whitespace.  For each token the fired node decides the tag text:
     node.conclusion when node.depth > 0, otherwise the initial tag.
     Each token yields (word_dict[word]["index"], pos_dict[tag],
     word_dict[word]["domain"]); a word missing from word_dict yields 0
     in the first and third positions.  Words and tags are lower-cased
     before lookup.
     """
     tokens = initializeSentence(DICT, rawLine).split()
     tagged = []
     for position, token in enumerate(tokens):
         fw_object = FWObject.getFWObject(tokens, position)
         word, tag = getWordTag(token)
         fired = self.findFiredNode(fw_object)
         at_root = not fired.depth > 0
         entry = ct.get(word.lower(), word_dict, default=0)
         if at_root:
             # Fired at root: fall back to the initialised tag.
             label = tag.lower()
         else:
             label = fired.conclusion.lower()
         if entry == 0:
             if at_root:
                 # NOTE(review): this is the only tag lookup without a
                 # default, so an unknown tag raises here instead of
                 # mapping to 0 - confirm whether that is intended.
                 pos_id = ct.get(label, pos_dict)
             else:
                 pos_id = ct.get(label, pos_dict, default=0)
             tagged.append((0, pos_id, 0))
         else:
             tagged.append((ct.get("index", entry),
                            ct.get(label, pos_dict, default=0),
                            ct.get("domain", entry)))
     return tagged
def group_data(ctx):
    '''Split ctx['data'] into 'defaults' and 'data' records.

    A record whose 'sday' and 'eday' are both '0001-01-01' is filed under
    'defaults'; every other record goes under 'data'.  Returns ctx merged
    with both lists, using [] for a group that received no records.
    '''

    def bucket(record):
        is_default = (record['sday'] == '0001-01-01'
                      and record['eday'] == '0001-01-01')
        return 'defaults' if is_default else 'data'

    grouped = groupby(bucket, ctx['data'])
    split = {
        'data': get('data', grouped, []),
        'defaults': get('defaults', grouped, [])
    }
    return merge(ctx, split)
def delete(ctx, cfg):
    '''Delete the chip, pixels and segments stored for ctx's chip.

    'cx' and 'cy' are read from ctx and converted with int() before being
    handed to the _ceph delete calls.  cfg is accepted but not used here.
    Returns the ctx it was given.
    '''
    cx, cy = (int(get(key, ctx)) for key in ('cx', 'cy'))

    _ceph.delete_chip(cx, cy)
    _ceph.delete_pixels(cx, cy)
    _ceph.delete_segments(cx, cy)

    return ctx
Exemple #7
0
def delete(ctx, cfg):
    '''Delete the chip, pixel and segment rows for ctx's chip.

    Builds one statement each from db.delete_chip, db.delete_pixels and
    db.delete_segments for int(ctx['cx']), int(ctx['cy']) and passes them,
    in that order, to db.execute_statements.  Returns the ctx it was given.
    '''
    cx = int(get('cx', ctx))
    cy = int(get('cy', ctx))

    builders = (db.delete_chip, db.delete_pixels, db.delete_segments)
    statements = [build(cfg, cx, cy) for build in builders]
    db.execute_statements(cfg, statements)

    return ctx
Exemple #8
0
def log_request(ctx):
    '''Log the cx, cy and acquired values of a POST /segment request.

    Values missing from ctx are logged as None.  Returns ctx.
    '''

    fields = {
        'cx': get('cx', ctx, None),
        'cy': get('cy', ctx, None),
        'a': get('acquired', ctx, None),
    }

    logger.info('POST /segment {cx}, {cy}, {a}'.format(**fields))

    return ctx
def exception_handler(ctx, http_status, name, fn):
    '''Run fn(ctx); on failure log and return an error description.

    When fn raises an Exception, a dict holding ctx's cx, cy and acquired
    values (None when absent), a formatted exception message and
    http_status is passed to logger.error via do(), and the result of
    do() is returned.  Otherwise fn's own result is returned.
    '''
    try:
        result = fn(ctx)
    except Exception as err:
        failure = {'cx': get('cx', ctx, None),
                   'cy': get('cy', ctx, None),
                   'acquired': get('acquired', ctx, None),
                   'exception': '{name} exception: {ex}'.format(name=name, ex=err),
                   'http_status': http_status}
        return do(logger.error, failure)
    else:
        return result
def load_data(ctx, cfg):
    '''Return ctx with 'data' set to the result of the loading pipeline.

    ctx is threaded through the steps below in order; each step receives
    the previous step's result.  'month' and 'day' are read from ctx and
    bound into the prediction-date step.
    '''
    steps = (
        partial(segments, cfg=cfg),
        partial(segaux.aux, cfg=cfg),
        segaux.combine,
        segaux.unload_segments,
        segaux.unload_aux,
        extract_segments,
        partial(segaux.prediction_dates,
                month=get("month", ctx),
                day=get("day", ctx)),
        segaux.average_reflectance,
        reformat,
    )
    return assoc(ctx, 'data', thread_first(ctx, *steps))
Exemple #11
0
def log_request(ctx):
    '''Log the parameters of a POST /tile request and return ctx.

    tx, ty, acquired, date and chips are read from ctx; any that are
    missing are logged as None.
    '''

    tx, ty, acquired, date, chips = (
        get(key, ctx, None)
        for key in ('tx', 'ty', 'acquired', 'date', 'chips'))

    logger.info("POST /tile {x},{y},{a},{d},{c}".format(
        x=tx, y=ty, a=acquired, d=date, c=chips))

    return ctx
 def wrapper(*args, **kwargs):
     '''Call fn, log how long it took, and return fn's result.

     The log record carries the cx, cy and acquired values of the
     returned ctx (None when absent) plus '<fn name>_elapsed_seconds'.
     '''
     began = datetime.now()
     ctx = fn(*args, **kwargs)

     record = {key: get(key, ctx, None) for key in ('cx', 'cy', 'acquired')}
     timing_key = '{name}_elapsed_seconds'.format(name=fn.__name__)
     elapsed = (datetime.now() - began).total_seconds()

     logger.info(assoc(record, timing_key, elapsed))
     return ctx
Exemple #13
0
def average_reflectance_fn(segment):
    '''Return segment merged with one average-reflectance value per band.

    For every band the value is
    intercept + spectral_slope(coefficient key, segment) * ordinal,
    where ordinal is the proleptic ordinal of segment['date'] as parsed
    by arrow.
    '''

    ordinal = arrow.get(get('date', segment)).datetime.toordinal()

    # (output key, intercept key, coefficient key) for each band
    bands = (
        ('blar', 'blint', 'blcoef'),
        ('grar', 'grint', 'grcoef'),
        ('niar', 'niint', 'nicoef'),
        ('rear', 'reint', 'recoef'),
        ('s1ar', 's1int', 's1coef'),
        ('s2ar', 's2int', 's2coef'),
        ('thar', 'thint', 'thcoef'),
    )

    averages = {}
    for out_key, intercept_key, coef_key in bands:
        intercept = get(intercept_key, segment)
        slope = spectral_slope(coef_key, segment)
        averages[out_key] = add(intercept, mul(slope, ordinal))

    return merge(segment, averages)
def parameters(r):
    '''Validate and normalise the parameters of an HTTP request.

    tx, ty, acquired, chips and date must all be present (not None) in r,
    otherwise an Exception naming the required parameters is raised.
    tx and ty are converted with int(); every chip becomes an
    (int, int) tuple built from its first two items.  The three
    test_*_exception flags are copied through, defaulting to None.
    '''

    tx = get('tx', r, None)
    ty = get('ty', r, None)
    acquired = get('acquired', r, None)
    chips = get('chips', r, None)
    date = get('date', r, None)

    required = (tx, ty, acquired, chips, date)
    if any(value is None for value in required):
        raise Exception(
            'tx, ty, acquired, chips and date are required parameters')

    return {
        'tx': int(tx),
        'ty': int(ty),
        'acquired': acquired,
        'date': date,
        'chips': [(int(first(chip)), int(second(chip))) for chip in chips],
        'test_data_exception': get('test_data_exception', r, None),
        'test_training_exception': get('test_training_exception', r, None),
        'test_save_exception': get('test_save_exception', r, None)
    }
def test_prediction_runs_as_expected(client):
    '''
    As a blackmagic user, when I send tx, ty, cx, cy, acquired, month
    and day via HTTP POST, predictions are generated and saved
    so that they can be retrieved later.
    '''

    create_prediction_test_data(client)

    # test prediction
    response = client.post('/prediction',
                           json={'tx': test.tx,
                                 'ty': test.ty,
                                 'cx': test.cx,
                                 'cy': test.cy,
                                 'month': test.prediction_month,
                                 'day': test.prediction_day,
                                 'acquired': test.acquired})

    predictions = _ceph.select_predictions(cx=test.cx, cy=test.cy)

    assert response.status == '200 OK'

    # Decode the response body once and reuse it for every field check.
    body = response.get_json()
    assert get('tx', body) == test.tx
    assert get('ty', body) == test.ty
    assert get('cx', body) == test.cx
    assert get('cy', body) == test.cy

    assert get('acquired', body) == test.acquired
    assert get('month', body) == test.prediction_month
    assert get('day', body) == test.prediction_day
    # None is a singleton: test identity, not equality.
    assert get('exception', body, None) is None

    # The number of predictions is dictated by the NLCDTRN dataset for the chip,
    # and the number of non-zero classifications available.
    assert len(list(predictions)) == 30000
Exemple #16
0
def prediction_dates(segments, month, day):
    '''Yield each segment once per prediction date, with 'date' set.

    When default_prediction_date returns a truthy date for a segment,
    that single date is used.  Otherwise the dates come from
    prediction_date_fn, called with the segment's 'sday' and 'eday'
    and the given month and day.
    '''

    for segment in segments:
        fixed = default_prediction_date(segment)

        if fixed:
            dates = (fixed,)
        else:
            dates = prediction_date_fn(sday=get('sday', segment),
                                       eday=get('eday', segment),
                                       month=month,
                                       day=day)

        for date in dates:
            yield assoc(segment, 'date', date)
Exemple #17
0
    def wrapper(*args, **kwargs):
        '''Call fn, log a JSON timing record, and return fn's result.

        The record holds the tx, ty, date and acquired values of the
        returned ctx (None when absent), the number of chips, and
        "<fn name>_elapsed_seconds".
        '''
        began = datetime.now()
        ctx = fn(*args, **kwargs)

        record = {key: get(key, ctx, None)
                  for key in ("tx", "ty", "date", "acquired")}
        record["chips"] = count(get("chips", ctx, []))

        timing_key = "{name}_elapsed_seconds".format(name=fn.__name__)
        elapsed = (datetime.now() - began).total_seconds()
        logger.info(json.dumps(assoc(record, timing_key, elapsed)))

        return ctx
def detection(ctx, cfg):
    '''Run detect over ctx['timeseries'] and merge the results into ctx.

    At most ctx['test_pixel_count'] timeseries items are taken, mapped
    through the worker pool opened from cfg, flattened and stored under
    'detections'.  If ctx carries a non-None 'test_detection_exception',
    ctx is instead merged with an exception built for HTTP status 500.
    '''

    with workers(cfg) as pool:
        if get('test_detection_exception', ctx, None) is None:
            pixels = take(ctx['test_pixel_count'], ctx['timeseries'])
            found = list(flatten(pool.map(detect, pixels)))
            return merge(ctx, {'detections': found})

        return merge(ctx, exception(msg='test_detection_exception', http_status=500))
def respond(ctx):
    '''Build the JSON HTTP response for ctx.

    The body carries cx, cy and acquired (None when absent) and, when
    ctx holds a truthy 'exception', that value as well.  The status code
    is ctx['http_status'], defaulting to 200.
    '''

    body = {key: get(key, ctx, None) for key in ('cx', 'cy', 'acquired')}

    error = get('exception', ctx, None)
    payload = assoc(body, 'exception', error) if error else body

    response = jsonify(payload)
    response.status_code = get('http_status', ctx, 200)

    return response
Exemple #20
0
def compute(t, lhs, rhs):
    """ Join Operation for Python Streaming Backend

    Note that a pure streaming Join is challenging/impossible because any row
    in one seq might connect to any row in the other, requiring simultaneous
    complete access.

    As a result this approach compromises and fully realizes the LEFT sequence
    (it is grouped into a dict keyed by the join columns) while allowing the
    RIGHT sequence to stream.

    Always put your bigger table on the RIGHT side of the Join.

    Yields one combined row per matching (left, right) pair: the left row
    followed by the right row's non-join columns.  Right rows without a
    matching left key are skipped.
    """
    lhs = compute(t.lhs, lhs)
    rhs = compute(t.rhs, rhs)

    # Row -> join-key extractors for each side.
    on_left = rowfunc(t.lhs[t.on_left])
    on_right = rowfunc(t.rhs[t.on_right])

    # Indices of the right-hand columns that are NOT join columns; only
    # these are appended to the matching left row.
    right_columns = list(range(len(t.rhs.columns)))
    for col in listpack(t.on_right):
        right_columns.remove(t.rhs.columns.index(col))

    # Rebuild the selection with the row's own type so it can be
    # concatenated to the left row with `+`.
    get_right = lambda x: type(x)(get(right_columns, x))

    # Fully realizes the left sequence.
    lhs_dict = groupby(on_left, lhs)

    for row in rhs:
        # NOTE(review): the try block also covers on_right, get_right and
        # the yield itself, so any KeyError raised there is silently
        # skipped as well, not only a missing join key.
        try:
            key = on_right(row)
            matches = lhs_dict[key]
            for match in matches:
                yield match + get_right(row)
        except KeyError:
            pass
    def tagRawSentenceHash(self, rawLine, DICT, word_dict):
        """Tag rawLine and return one hashed 3-tuple per token.

        The line is initialised with initializeSentence and split on
        whitespace.  Each token yields
        (word hash, tag hash, word category), where hashes come from
        murmurhash3_32 with seed 0 and the category is word_dict's entry
        for the word hash.  A word whose hash is not in word_dict yields
        0 for both the word hash and the category.  The tag is the fired
        node's conclusion when node.depth > 0, otherwise the initial tag.
        """
        line = initializeSentence(DICT, rawLine)

        sen = []
        wordTags = line.split()

        for i, wordTag in enumerate(wordTags):
            fwObject = FWObject.getFWObject(wordTags, i)
            word, tag = getWordTag(wordTag)
            node = self.findFiredNode(fwObject)

            # Only hash word once and block out-of-lexicon words
            word_hash = murmurhash3_32(word, seed=0)
            try:
                word_cat = ct.get(word_hash, word_dict)
            except LookupError:
                # Catch only the missing-entry case (KeyError/IndexError);
                # a bare except would also hide KeyboardInterrupt and
                # genuine programming errors.
                word_cat = 0
                word_hash = 0

            # Format and return
            if node.depth > 0:
                label = node.conclusion
            else:  # Fired at root, return initialized tag
                label = tag
            sen.append((word_hash, murmurhash3_32(label, seed=0), word_cat))

        return sen
Exemple #22
0
def test_segment_cassandra_exception(client):
    '''
    As a blackmagic user, when saving chips, pixels & segments to
    Cassandra raises an exception, the response is an HTTP 500
    carrying a descriptive message, and nothing is stored, so that
    the issue may be investigated, corrected & retried.
    '''

    cx, cy, a = test.cx, test.cy, test.acquired

    delete_detections(test.cx, test.cy)

    payload = {
        'cx': cx,
        'cy': cy,
        'acquired': a,
        'test_cassandra_exception': True
    }
    response = client.post('/segment', json=payload)

    def stored(select):
        # Run one select statement for the test chip.
        return db.execute_statement(cfg=app.cfg,
                                    stmt=select(cfg=app.cfg,
                                                cx=test.cx,
                                                cy=test.cy))

    chips = stored(db.select_chip)
    pixels = stored(db.select_pixels)
    segments = stored(db.select_segments)

    assert response.status == '500 INTERNAL SERVER ERROR'

    body = response.get_json()
    assert get('cx', body) == cx
    assert get('cy', body) == cy
    assert get('acquired', body) == a
    assert type(get('exception', body)) is str
    assert len(get('exception', body)) > 0

    assert len(list(chips)) == 0
    assert len(list(pixels)) == 0
    assert len(list(segments)) == 0
Exemple #23
0
def aux_filter(ctx):
    '''Return ctx with 'aux' reduced to entries with non-zero nlcdtrn.

    An entry of ctx['aux'] is kept when the first element of its
    'nlcdtrn' value is not 0.
    '''

    kept = {key: value
            for key, value in ctx['aux'].items()
            if first(get('nlcdtrn', value)) != 0}

    return assoc(ctx, 'aux', kept)
def test_aux():
    '''segaux.aux adds a non-None 'aux' entry for the test chip.'''
    request = dict(cx=test.cx, cy=test.cy, acquired=test.acquired)

    result = segaux.aux(request, blackmagic.cfg)

    assert get('aux', result, None) is not None
def test_tile_bad_parameters(client):
    '''
    As a blackmagic user, when I POST a tile request whose tx is not
    an integer, the HTTP status is 400, the response body echoes my
    parameters with an exception message, and no tile is stored,
    so that I can send a good request.
    '''

    payload = {
        'tx': "not-an-integer",
        'ty': test.ty,
        'acquired': test.acquired,
        'chips': test.chips,
        'date': test.training_date
    }

    delete_tile(test.tx, test.ty)

    response = client.post('/tile', json=payload)

    tiles = _ceph.select_tile(tx=test.tx, ty=test.ty)

    assert response.status == '400 BAD REQUEST'

    body = response.get_json()
    assert get('tx', body) == payload['tx']
    assert get('ty', body) == payload['ty']
    assert get('acquired', body) == payload['acquired']
    assert get('date', body) == payload['date']
    # The response reports the number of chips, not the chips themselves.
    assert get('chips', body) == count(payload['chips'])
    assert type(get('exception', body)) is str
    assert len(get('exception', body)) > 0
    assert len(list(tiles)) == 0
Exemple #26
0
def test_segment_merlin_no_input_data(client):
    '''
    As a blackmagic user, when there is no input data from which
    to build a timeseries, the response is an HTTP 500 with a
    message and nothing is stored, so that I know change detection
    cannot run for this time & space.
    '''

    cx, cy = test.cx, test.cy
    a = '1975/1976'

    delete_detections(test.cx, test.cy)

    payload = {
        'cx': cx,
        'cy': cy,
        'acquired': a
    }
    response = client.post('/segment', json=payload)

    def stored(select):
        # Run one select statement for the test chip.
        return db.execute_statement(cfg=app.cfg,
                                    stmt=select(cfg=app.cfg,
                                                cx=test.cx,
                                                cy=test.cy))

    chips = stored(db.select_chip)
    pixels = stored(db.select_pixels)
    segments = stored(db.select_segments)

    assert response.status == '500 INTERNAL SERVER ERROR'

    body = response.get_json()
    assert get('cx', body) == cx
    assert get('cy', body) == cy
    assert get('acquired', body) == a
    assert type(get('exception', body)) is str
    assert len(get('exception', body)) > 0

    assert len(list(chips)) == 0
    assert len(list(pixels)) == 0
    assert len(list(segments)) == 0
Exemple #27
0
def test_segment_bad_parameters(client):
    '''
    As a blackmagic user, when I POST a segment request without a
    usable cx, the HTTP status is 400, the response body carries an
    exception message and nothing is stored, so that I can send a
    good request.
    '''

    # bad parameters: cx is missing
    cx, cy, a = None, test.cy, test.acquired

    delete_detections(test.cx, test.cy)

    payload = {
        'cx': cx,
        'cy': cy,
        'acquired': a
    }
    response = client.post('/segment', json=payload)

    def stored(select):
        # Run one select statement for the test chip.
        return db.execute_statement(cfg=app.cfg,
                                    stmt=select(cfg=app.cfg,
                                                cx=test.cx,
                                                cy=test.cy))

    chips = stored(db.select_chip)
    pixels = stored(db.select_pixels)
    segments = stored(db.select_segments)

    assert response.status == '400 BAD REQUEST'

    body = response.get_json()
    assert get('cx', body) == cx
    assert get('cy', body) == cy
    assert get('acquired', body) == a
    assert type(get('exception', body)) is str
    assert len(get('exception', body)) > 0

    assert len(list(chips)) == 0
    assert len(list(pixels)) == 0
    assert len(list(segments)) == 0
Exemple #28
0
    def _get_json(self, key):
        '''Fetch the object stored under key and return it parsed as JSON.

        The body is gunzipped first when the object's 'ContentEncoding'
        is 'gzip'; in both cases the bytes are decoded as UTF-8.
        '''
        obj = self.client.get_object(Bucket=self.bucket_name, Key=key)

        gzipped = get('ContentEncoding', obj, None) == 'gzip'
        raw = obj['Body'].read()
        text = (gzip.decompress(raw) if gzipped else raw).decode('utf-8')

        return json.loads(text)
Exemple #29
0
def save(ctx, cfg):
    '''Save the chip, pixels and segments held in ctx, then return ctx.

    Raises Exception('test_cassandra_exception') before saving anything
    when ctx carries a non-None 'test_cassandra_exception' value.
    '''

    if get('test_cassandra_exception', ctx, None) is not None:
        raise Exception('test_cassandra_exception')

    for persist in (save_chip, save_pixels, save_segments):
        persist(ctx, cfg)

    return ctx
Exemple #30
0
    def assemble(pair):
        """Build one joined row from a (left, right) pair of rows.

        Either side may be None.  The join columns come from the left row
        when it exists, otherwise from the right row; the remaining
        columns of a missing side are filled with None.
        """
        left, right = pair

        if left is not None:
            joined = get(on_left, left)
            left_entries = get(left_self_columns, left)
        else:
            joined = get(on_right, right)
            left_entries = (None, ) * (len(t.lhs.fields) - len(on_left))

        if right is None:
            right_entries = (None, ) * (len(t.rhs.fields) - len(on_right))
        else:
            right_entries = get(right_self_columns, right)

        return joined + left_entries + right_entries
Exemple #31
0
    def _get_bin(self, key):
        '''Fetch the object stored under key and return its raw bytes.

        The body is gunzipped when the object's 'ContentEncoding' is
        'gzip'; otherwise it is returned exactly as read.
        '''
        obj = self.client.get_object(Bucket=self.bucket_name, Key=key)

        gzipped = get('ContentEncoding', obj, None) == 'gzip'
        payload = obj['Body'].read()

        return gzip.decompress(payload) if gzipped else payload
Exemple #32
0
    def assemble(pair):
        """Combine a (left, right) row pair into a single output row.

        The result is the join columns, then the left row's other
        columns, then the right row's other columns.  A side that is
        None contributes None for each of its non-join columns, and the
        join columns are then read from the other side.
        """
        a, b = pair
        has_left = a is not None

        joined = get(on_left, a) if has_left else get(on_right, b)

        left_entries = (get(left_self_columns, a) if has_left
                        else (None,) * (len(t.lhs.fields) - len(on_left)))

        right_entries = (get(right_self_columns, b) if b is not None
                         else (None,) * (len(t.rhs.fields) - len(on_right)))

        return joined + left_entries + right_entries
Exemple #33
0
def rowfunc(t):
    """ Build a function that picks t's columns out of a parent row.

    The columns of ``t`` are located by position in ``t.parent.columns``
    and the returned function extracts those positions from a row.

    >>> accounts = TableSymbol('accounts', '{name: string, amount: int}')
    >>> f = rowfunc(accounts['amount'])

    >>> row = ('Alice', 100)
    >>> f(row)
    100

    See Also:
        compute<Rowwise, Sequence>
    """
    from toolz.curried import get
    positions = list(map(t.parent.columns.index, t.columns))
    return get(positions)
Exemple #34
0
	def tagRawSentence(self, rawLine, DICT, word_dict, pos_dict):
		"""Tag rawLine and encode every token as a 3-tuple of lookups.

		The line is initialised with initializeSentence and split on
		whitespace.  For each token the fired node decides the tag text:
		node.conclusion when node.depth > 0, otherwise the initial tag.
		Each token yields (word_dict[word]["index"], pos_dict[tag],
		word_dict[word]["domain"]); a word missing from word_dict yields 0
		in the first and third positions.  Words and tags are lower-cased
		before lookup.
		"""
		tokens = initializeSentence(DICT, rawLine).split()
		tagged = []
		for position, token in enumerate(tokens):
			fw_object = FWObject.getFWObject(tokens, position)
			word, tag = getWordTag(token)
			fired = self.findFiredNode(fw_object)
			at_root = not fired.depth > 0
			entry = ct.get(word.lower(), word_dict, default = 0)
			if at_root:
				# Fired at root: fall back to the initialised tag.
				label = tag.lower()
			else:
				label = fired.conclusion.lower()
			if entry == 0:
				if at_root:
					# NOTE(review): this is the only tag lookup without a
					# default, so an unknown tag raises here instead of
					# mapping to 0 - confirm whether that is intended.
					pos_id = ct.get(label, pos_dict)
				else:
					pos_id = ct.get(label, pos_dict, default = 0)
				tagged.append((0, pos_id, 0))
			else:
				tagged.append((ct.get("index", entry), ct.get(label, pos_dict, default = 0), ct.get("domain", entry)))
		return tagged
Exemple #35
0
def parent_signin():
  """Mark the selected students as out of class for a parent's barcode.

  Reads 'barcode' and the 'students' list from the submitted form.
  Renders the 'signin' template with an error message when no user has
  that barcode, or when the user is not authorized for every selected
  student.  Otherwise sets in_class to False on each selected student
  and renders the 'success' template.
  """
  barcode = request.forms['barcode']
  students = request.forms.getlist('students')

  # Build a real list: filter() is lazy on Python 3, so it supports
  # neither len() nor indexing.  The previous `len(u) < 0` test could
  # never be true, so an unknown barcode crashed on u[0].
  matches = [v for v in data['users'].values() if v.id == barcode]
  if not matches:
    return template('signin', message='Barcode ' + barcode +
                    ' not recognized.')

  u = matches[0]
  # check auth
  selected = t.get(students, data['students'])
  if not all(u.name in s.authorized for s in selected):
    return template('signin', message='Not authorized for all students.')

  for s in students:
    data['students'][s].in_class = False

  return template('success', students=str(students), in_out='out')
Exemple #36
0
 def get(self, ind, default=None):
     """Look ind up in this object via cytoolz.get.

     default (None unless given) is always passed on to cytoolz.get.
     """
     found = cytoolz.get(ind, self, default)
     return found
Exemple #37
0
def _do_fit_step(dsk, next_token, step, cv, fields, tokens, params, Xs, ys,
                 fit_params, n_splits, error_score, step_fields_lk,
                 fit_params_lk, field_to_index, step_name, none_passthrough,
                 is_transform):
    """Fit (and, when ``is_transform``, fit-transform) one named step.

    Three cases are handled:

    * ``step_name`` is in ``field_to_index``: the estimator itself may
      differ between entries of ``params``.  Entries are grouped with
      ``_group_ids_by_index`` and each group is fitted with its own
      estimator (``step`` when the parameter is ``MISSING``).
    * ``step`` is ``None``: every fit is ``None``.
    * otherwise: ``step`` is fitted for all entries at once.

    A ``None`` estimator produces ``None`` fits; its Xs are passed through
    unchanged when ``none_passthrough`` is true and replaced by ``None``
    otherwise.

    Returns ``(fits, Xs)`` when ``is_transform`` is true, else
    ``(fits, None)``.
    """
    # Field names and their parameter indices registered for this step.
    sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2))
    sub_fit_params = fit_params_lk[step_name]

    if step_name in field_to_index:
        # The estimator may change each call
        new_fits = {}
        new_Xs = {}
        est_index = field_to_index[step_name]

        for ids in _group_ids_by_index(est_index, tokens):
            # Get the estimator for this subgroup
            sub_est = params[ids[0]][est_index]
            if sub_est is MISSING:
                sub_est = step

            # If an estimator is `None`, there's nothing to do
            if sub_est is None:
                nones = dict.fromkeys(ids, None)
                new_fits.update(nones)
                if is_transform:
                    if none_passthrough:
                        new_Xs.update(zip(ids, get(ids, Xs)))
                    else:
                        new_Xs.update(nones)
            else:
                # Extract the proper subset of Xs, ys
                sub_Xs = get(ids, Xs)
                sub_ys = get(ids, ys)
                # Only subset the parameters/tokens if necessary
                if sub_fields:
                    sub_tokens = list(pluck(sub_inds, get(ids, tokens)))
                    sub_params = list(pluck(sub_inds, get(ids, params)))
                else:
                    sub_tokens = sub_params = None

                if is_transform:
                    sub_fits, sub_Xs = do_fit_transform(dsk, next_token,
                                                        sub_est, cv, sub_fields,
                                                        sub_tokens, sub_params,
                                                        sub_Xs, sub_ys,
                                                        sub_fit_params,
                                                        n_splits, error_score)
                    new_Xs.update(zip(ids, sub_Xs))
                    new_fits.update(zip(ids, sub_fits))
                else:
                    sub_fits = do_fit(dsk, next_token, sub_est, cv,
                                      sub_fields, sub_tokens, sub_params,
                                      sub_Xs, sub_ys, sub_fit_params,
                                      n_splits, error_score)
                    new_fits.update(zip(ids, sub_fits))
        # Extract lists of transformed Xs and fit steps
        # (results were stored by id; read them back in positional order)
        all_ids = list(range(len(Xs)))
        if is_transform:
            Xs = get(all_ids, new_Xs)
        fits = get(all_ids, new_fits)
    elif step is None:
        # Nothing to do
        fits = [None] * len(Xs)
        if not none_passthrough:
            # Without passthrough the transformed Xs are all None as well.
            Xs = fits
    else:
        # Only subset the parameters/tokens if necessary
        if sub_fields:
            sub_tokens = list(pluck(sub_inds, tokens))
            sub_params = list(pluck(sub_inds, params))
        else:
            sub_tokens = sub_params = None

        if is_transform:
            fits, Xs = do_fit_transform(dsk, next_token, step, cv,
                                        sub_fields, sub_tokens, sub_params,
                                        Xs, ys, sub_fit_params, n_splits,
                                        error_score)
        else:
            fits = do_fit(dsk, next_token, step, cv, sub_fields,
                          sub_tokens, sub_params, Xs, ys, sub_fit_params,
                          n_splits, error_score)
    return (fits, Xs) if is_transform else (fits, None)
Exemple #38
0
def _do_featureunion(dsk, next_token, est, cv, fields, tokens, params, Xs, ys,
                     fit_params, n_splits, error_score):
    """Add the tasks for fitting and transforming a FeatureUnion to ``dsk``.

    Every transformer in ``est.transformer_list`` is fit-transformed with
    ``_do_fit_step``; the per-step results are then recombined by adding
    ``feature_union`` and ``feature_union_concat`` tasks to ``dsk`` for
    each split.  Identical (steps, weight token) combinations share one
    set of tasks.

    Returns two lists of keys, one per entry in the zipped inputs: the
    ``(fit_name, i)`` keys and the ``(tr_name, i)`` keys.

    Raises ``NotImplementedError`` when 'transformer_list' is one of the
    searched ``fields``.
    """
    if 'transformer_list' in fields:
        raise NotImplementedError("Setting FeatureUnion.transformer_list "
                                  "in a gridsearch")

    # NOTE(review): ('transformer_weights') is a plain str, not a 1-tuple
    # (no trailing comma) - confirm how _group_subparams uses `ignore`.
    (field_to_index,
     step_fields_lk) = _group_subparams(est.transformer_list, fields,
                                        ignore=('transformer_weights'))
    fit_params_lk = _group_fit_params(est.transformer_list, fit_params)

    token = next_token(est)

    n_samples = _do_n_samples(dsk, token, Xs, n_splits)

    fit_steps = []
    tr_Xs = []
    for (step_name, step) in est.transformer_list:
        # none_passthrough=False, is_transform=True for every transformer
        fits, out_Xs = _do_fit_step(dsk, next_token, step, cv, fields, tokens,
                                    params, Xs, ys, fit_params, n_splits,
                                    error_score, step_fields_lk, fit_params_lk,
                                    field_to_index, step_name, False, True)
        fit_steps.append(fits)
        tr_Xs.append(out_Xs)

    # Rebuild the FeatureUnions
    step_names = [n for n, _ in est.transformer_list]

    if 'transformer_weights' in field_to_index:
        # Weights are searched over: resolve them once per distinct token.
        index = field_to_index['transformer_weights']
        weight_lk = {}
        weight_tokens = list(pluck(index, tokens))
        for i, tok in enumerate(weight_tokens):
            if tok not in weight_lk:
                weights = params[i][index]
                if weights is MISSING:
                    weights = est.transformer_weights
                lk = weights or {}
                # One weight per step, None where no weight is given
                weight_list = [lk.get(n) for n in step_names]
                weight_lk[tok] = (weights, weight_list)
        weights = get(weight_tokens, weight_lk)
    else:
        # Weights are fixed: repeat the estimator's own weights endlessly
        # so they can be zipped against the other sequences below.
        lk = est.transformer_weights or {}
        weight_list = [lk.get(n) for n in step_names]
        weight_tokens = repeat(None)
        weights = repeat((est.transformer_weights, weight_list))

    out = []
    out_append = out.append
    fit_name = 'feature-union-' + token
    tr_name = 'feature-union-concat-' + token
    m = 0
    # Maps an already-built (steps, weight token) combination to its index
    # so duplicates reuse the same tasks.
    seen = {}
    for steps, Xs, wt, (w, wl), nsamp in zip(zip(*fit_steps), zip(*tr_Xs),
                                             weight_tokens, weights, n_samples):
        if (steps, wt) in seen:
            out_append(seen[steps, wt])
        else:
            for n in range(n_splits):
                # Each key is extended with the split number n; None
                # entries are kept as None.
                dsk[(fit_name, m, n)] = (feature_union, step_names,
                                         [None if s is None else s + (n,)
                                          for s in steps], w)
                dsk[(tr_name, m, n)] = (feature_union_concat,
                                        [None if x is None else x + (n,)
                                         for x in Xs], nsamp + (n,), wl)
            seen[steps, wt] = m
            out_append(m)
            m += 1
    return [(fit_name, i) for i in out], [(tr_name, i) for i in out]