Example #1
def train():
    x_lines = [
        *toolz.take(LIMIT,
                    open('data/x.txt').read().lower().split('\n'))
    ]
    y_lines = [
        *toolz.take(LIMIT,
                    open('data/y.txt').read().lower().split('\n'))
    ]

    encoder = encoder_for_lines(S2S_PARAMS, x_lines + y_lines)

    try:
        start_idx = encoder.word_vocab[S2S_PARAMS.start_token]
        pad_idx = encoder.word_vocab[PAD_TOKEN]
    except AttributeError:
        start_idx = int(encoder.vocabulary_[S2S_PARAMS.start_token])
        pad_idx = encoder.vocabulary_[PAD_TOKEN]

    reverse_enc = {idx: word for word, idx in encoder.vocabulary_.items()}
    model = build_model(S2S_PARAMS, start_idx, pad_idx)

    x = encode_data(encoder, x_lines, is_input=True)
    y = encode_data(encoder, y_lines, is_input=False)

    print(x.shape, y.shape)

    x = x[:S2S_PARAMS.batch_size * int(len(x) / S2S_PARAMS.batch_size)]
    y = y[:S2S_PARAMS.batch_size * int(len(y) / S2S_PARAMS.batch_size)]

    test_x = x[:S2S_PARAMS.batch_size]
    losses = []

    if USE_COMET:
        experiment = Experiment(api_key="DQqhNiimkjP0gK6c8iGz9orzL",
                                log_code=True)
        experiment.log_multiple_params(S2S_PARAMS._asdict())
        for idx in range(1000):
            print("Shuffling data...")
            random_idx = random.sample([*range(len(x))], len(x))
            x = x[random_idx]
            y = y[random_idx]
            print("Training in epoch " + str(idx))
            losses.append(model.train_epoch(x, y, experiment=experiment))
            experiment.log_epoch_end(idx)
            print('Loss history: {}'.format(', '.join(
                ['{:.4f}'.format(loss) for loss in losses])))
            test_y = model.predict(test_x)
            for i in range(min([3, S2S_PARAMS.batch_size])):
                print('> ' + ' '.join(
                    reverse_enc.get(idx, '<unk/>') for idx in list(test_y[i])))
    else:
        for idx in range(1000):
            print("Training in epoch " + str(idx))
            model.train_epoch(x, y)
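The toolz.take(LIMIT, ...) calls at the top cap how many lines are read from each corpus. A minimal standalone sketch of that pattern, assuming a hypothetical LIMIT value and the data/x.txt path from the example:

from toolz import take

LIMIT = 1000  # hypothetical cap on the number of training lines
with open('data/x.txt') as fh:
    x_lines = list(take(LIMIT, fh.read().lower().split('\n')))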
Example #2
File: ssh.py Project: MoherX/odo
def sample_ssh(data, lines=500):
    """ Grab a few lines from the remote file """
    with tmpfile() as fn:
        with open(fn, 'w') as f:
            for line in take(lines, data.lines()):
                f.write(line)
        yield fn
Example #3
def create_merge_tree(func, keys, token):
    """Create a task tree that merges all the keys with a reduction function.

    Parameters
    ----------
    func: callable
        Reduction function that accepts a single list of values to reduce.
    keys: iterable
        Keys to reduce from the source dask graph.
    token: object
        Included in each key of the returned dict.

    This creates a k-ary tree where k depends on the current level and is
    greater the further away a node is from the root node.  This reduces the
    total number of nodes (thereby reducing scheduler overhead), but still
    has beneficial properties of trees.

    For reasonable numbers of keys, N < 1e5, the total number of nodes in the
    tree is roughly ``N**0.78``.  For 1e5 < N < 2e5, it is roughly ``N**0.8``.
    """
    level = 0
    prev_width = len(keys)
    prev_keys = iter(keys)
    rv = {}
    while prev_width > 1:
        width = tree_width(prev_width)
        groups = tree_groups(prev_width, width)
        keys = [(token, level, i) for i in range(width)]
        rv.update((key, (func, list(take(num, prev_keys))))
                  for num, key in zip(groups, keys))
        prev_width = width
        prev_keys = iter(keys)
        level += 1
    return rv
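The detail to notice here is that prev_keys is a single shared iterator, so each list(take(num, prev_keys)) call consumes the next, non-overlapping chunk of keys. A minimal sketch of that chunking behaviour with hypothetical group sizes:

from toolz import take

it = iter(range(10))
chunks = [list(take(num, it)) for num in (4, 3, 3)]
# chunks == [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]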
Example #4
def run(n, x, *goals, results_filter=None):
    """Run a logic program and obtain n solutions that satisfy the given goals.

    >>> from kanren import run, var, eq
    >>> x = var()
    >>> run(1, x, eq(x, 1))
    (1,)

    Parameters
    ----------
    n: int
        The number of desired solutions. `n=0` returns a tuple
        with all results and `n=None` returns a lazy sequence of all results.
    x: object
        The form to reify and output.  Usually contains logic variables used in
        the given goals.
    goals: Callables
        A sequence of goals that must be true in logical conjunction
        (i.e. `lall`).
    results_filter: Callable
        A function to apply to the results stream (e.g. a `unique` filter).
    """
    results = map(partial(reify, x), lall(*goals)({}))

    if results_filter is not None:
        results = results_filter(results)

    if n is None:
        return results
    elif n == 0:
        return tuple(results)
    else:
        return tuple(take(n, results))
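Because take is lazy, run only forces the first n solutions even when the underlying goal stream is infinite. A small sketch of that behaviour using itertools.count as a stand-in for the results stream:

from itertools import count
from toolz import take

first_three = tuple(take(3, count()))  # (0, 1, 2); only three items are ever realized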
Example #5
def test_map(client):
    with client.get_executor() as e:
        N = 10
        it = e.map(inc, range(N))
        expected = set(range(1, N + 1))
        for x in it:
            expected.remove(x)
        assert not expected

    with client.get_executor(pure=False) as e:
        N = 10
        it = e.map(slowinc, range(N), [0.1] * N, timeout=0.4)
        results = []
        with pytest.raises(TimeoutError):
            for x in it:
                results.append(x)
        assert 2 <= len(results) < 7

    with client.get_executor(pure=False) as e:
        N = 10
        # Not consuming the iterator will cancel remaining tasks
        it = e.map(slowinc, range(N), [0.1] * N)
        for x in take(2, it):
            pass
        # Some tasks still processing
        assert number_of_processing_tasks(client) > 0
        # Garbage collect the iterator => remaining tasks are cancelled
        del it
        assert number_of_processing_tasks(client) == 0
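The third block above relies on take(2, it) consuming only two results and leaving the rest of the executor's iterator untouched, so the remaining tasks can later be cancelled. A standalone sketch of that partial consumption:

from toolz import take

it = iter(range(10))
first_two = list(take(2, it))   # [0, 1]
leftover = next(it)             # 2 -- the remaining items were never consumed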
Example #6
def main():

    print("mining started")

    collective_match_data = []
    try:
        top_players = get_live_players()

        for player_id in top_players:
            try:
                matches = api.get_match_history(player_id)
            except Exception:
                continue
            match_ids = parse_match_ids_from_player_data(matches)

            for match_id in list(take(3, match_ids)):
                match_details = api.get_match_details(match_id)
                collective_match_data.append(match_details)

        minified = minify_data(collective_match_data)
        insert_many(minified)
    except Exception as e:
        print(e)

    print("miner added " + str(len(collective_match_data)) + " new matches")
Example #7
def convert_bars(r: Tuple, cutoff: int = 10) -> Tuple:
    """Reduce number of CHARs in a BAR expression to the cutoff number."""

    ty = r[0]

    if ty == BAR:
        tail_converted = tuple(map(convert_bars, r[1:]))

        grouped = toolz.groupby(lambda x: classify_char(x[1]) if x[0] == PToken.CHAR else CharSet.OTHER,
                                tail_converted)

        if CharSet.WORD in grouped and cutoff < len(grouped[CharSet.WORD]):
            logger.info('reducing WORD choices')
            grouped[CharSet.WORD] = toolz.take(cutoff, grouped[CharSet.WORD])

        return (PToken.BAR,) + tuple(toolz.concat(grouped.values()))
    elif ty == PToken.BACKREF:
        raise NotImplementedError
    elif ty == PToken.CHAR:
        return r
    elif ty == PToken.GROUP:
        return tuple(convert_bars(elt) if ii > 1 else elt
                     for ii, elt in enumerate(r))
    else:
        return tuple(convert_bars(elt) if ii > 0 else elt
                     for ii, elt in enumerate(r))
Example #9
def create_merge_tree(func, keys, token):
    """Create a task tree that merges all the keys with a reduction function.

    Parameters
    ----------
    func: callable
        Reduction function that accepts a single list of values to reduce.
    keys: iterable
        Keys to reduce from the source dask graph.
    token: object
        Included in each key of the returned dict.

    This creates a k-ary tree where k depends on the current level and is
    greater the further away a node is from the root node.  This reduces the
    total number of nodes (thereby reducing scheduler overhead), but still
    has beneficial properties of trees.

    For reasonable numbers of keys, N < 1e5, the total number of nodes in the
    tree is roughly ``N**0.78``.  For 1e5 < N < 2e5, it is roughly ``N**0.8``.
    """
    level = 0
    prev_width = len(keys)
    prev_keys = iter(keys)
    rv = {}
    while prev_width > 1:
        width = tree_width(prev_width)
        groups = tree_groups(prev_width, width)
        keys = [(token, level, i) for i in range(width)]
        rv.update((key, (func, list(take(num, prev_keys))))
                  for num, key in zip(groups, keys))
        prev_width = width
        prev_keys = iter(keys)
        level += 1
    return rv
Example #10
def read_events_in_batch(config, batch_id, batch):
    first_rec = batch.iloc[0]
    asof_dt = first_rec.asof_dt
    bucket = config['VIDEO_END_BUCKET']
    print(f'>> start event download,batch_id={batch_id}')
    limit_events_per_batch = config.get("LIMIT_EVENTS_PER_BATCH")
    if limit_events_per_batch is not None:
        print(
            f">> WARNING: Limiting events to no more than {limit_events_per_batch} events per batch"
        )
    s3 = get_client()

    def download_events(name):
        print(f">>downloading {name}")

        filename = Path(name).name

        retr = s3.get_object(Bucket=bucket, Key=str(name))
        reader = retr['Body'].iter_lines()
        reader = map(safe_json_loads, reader)
        reader = (merge(x, {
            'file_idx': file_idx,
            'file': filename,
            'asof_dt': asof_dt
        }) for file_idx, x in enumerate(reader))
        return reader

    reader = map(download_events, batch.name)
    reader = concat(reader)
    if limit_events_per_batch is not None:
        reader = take(limit_events_per_batch, reader)

    return batch_id, reader
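The cap at the end only applies when LIMIT_EVENTS_PER_BATCH is configured; otherwise the full lazy stream is returned. A hedged sketch of that pattern with a hypothetical event generator standing in for the S3 reader:

from toolz import take

def events():
    i = 0
    while True:  # stand-in for an unbounded line reader
        yield {'file_idx': i}
        i += 1

limit = 100  # hypothetical LIMIT_EVENTS_PER_BATCH value
reader = events()
if limit is not None:
    reader = take(limit, reader)
n_events = sum(1 for _ in reader)  # 100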
Example #11
    def __init__(self,
                 path,
                 mode='rt',
                 schema=None,
                 columns=None,
                 types=None,
                 typehints=None,
                 dialect=None,
                 header=None,
                 open=open,
                 nrows_discovery=50,
                 chunksize=1024,
                 encoding=DEFAULT_ENCODING,
                 **kwargs):
        if 'r' in mode and not os.path.isfile(path):
            raise ValueError('CSV file "%s" does not exist' % path)

        if schema is None and 'w' in mode:
            raise ValueError('Please specify schema for writable CSV file')

        self.path = path
        self.mode = mode
        self.open = {'gz': gzip.open, 'bz2': bz2.BZ2File}.get(ext(path), open)
        self._abspath = os.path.abspath(path)
        self.chunksize = chunksize
        self.encoding = encoding

        sample = get_sample(self)
        self.dialect = dialect = discover_dialect(sample, dialect, **kwargs)

        if header is None:
            header = has_header(sample, encoding=encoding)
        elif isinstance(header, int):
            header = True
        self.header = header

        if not schema and 'w' not in mode:
            schema = discover_csv(path,
                                  encoding=encoding,
                                  dialect=dialect,
                                  header=self.header,
                                  typehints=typehints,
                                  types=types,
                                  columns=columns,
                                  nrows_discovery=nrows_discovery)
        self._schema = schema
        self.header = header

        if 'w' not in mode:
            try:
                nd.array(list(take(10, self._iter(chunksize=10))),
                         dtype=str(schema))
            except (ValueError, TypeError) as e:
                raise ValueError("Automatic datashape discovery failed\n"
                                 "Discovered the following datashape: %s\n"
                                 "But DyND generated the following error: %s\n"
                                 "Consider providing type hints using "
                                 "typehints={'column-name': 'type'}\n"
                                 "like typehints={'start-time': 'string'}" %
                                 (schema, e.args[0]))
Example #12
def test_local_client(loop):
    def produce(n):
        with local_client() as c:
            x = c.channel('x')
            for i in range(n):
                future = c.submit(slowinc, i, delay=0.01, key='f-%d' % i)
                x.append(future)

            x.flush()

    def consume():
        with local_client() as c:
            x = c.channel('x')
            y = c.channel('y')
            last = 0
            for i, future in enumerate(x):
                last = c.submit(add, future, last, key='add-' + future.key)
                y.append(last)

    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.channel('x')
            y = c.channel('y')

            producers = (c.submit(produce, 5), c.submit(produce, 10))
            consumer = c.submit(consume)

            results = []
            for i, future in enumerate(take(15, y)):
                result = future.result()
                results.append(result)

            assert len(results) == 15
            assert all(0 < r < 100 for r in results)
Example #13
def sample_ssh(data, lines=500):
    """ Grab a few lines from the remote file """
    with tmpfile() as fn:
        with open(fn, 'w') as f:
            for line in take(lines, data.lines()):
                f.write(line)
        yield fn
Example #14
def main(config_f=None, overrides=None):
    config = get_config(config_f=config_f, overrides=overrides)
    print(f">> start extract events, config={config}")

    batch_limit = config.get('BATCH_LIMIT')
    partitions_d = Path(config['PARTITIONS_D'])
    write_partition = get_partition_writer(partitions_d=partitions_d)

    batches = read_event_batches(config)
    if batch_limit:
        print(
            f">> WARNING: limiting run to no more than {batch_limit} batches")
        batches = take(batch_limit, batches)

    reader = (read_events_in_batch(config=config,
                                   batch_id=batch_id,
                                   batch=batch) for batch_id, batch in batches)

    reader = (parse_events_in_batch(batch_id=batch_id, reader=rdr)
              for batch_id, rdr in reader)

    reader = (write_partition(batch_id, rdr) for batch_id, rdr in reader)

    for x in reader:
        pass
    print(">> end proc events")
Example #15
def forcastall(intid):
    data = map(int, read_artist(intid)["action_1"])
    sun = training(data, 4)
    fun = toolz.compose(str, int)

    predictdata = map(fun, toolz.take(60, sun))    # forecast 60 days
    with open("./past_forcast/{aid}.csv".format(aid=intid), "wt") as f:
        f.write(",".join(predictdata))
Example #16
def song_info(artist, title):
    if title == u'':
        print("Searching for '%s'" % artist)
        result = song.search(combined=artist)
    else:
        print("Searching for '%s - %s'" % (artist, title))
        result = song.search(artist=artist, title=title)
    print_search_results(take(3, result))
Example #17
    async def rotate_tokens(self):
        # try each token, query its rate limit
        # if none of them works log and sleep
        for token in toolz.take(self._n_tokens, self._tokens):
            remaining = await self.rate_limit(token)

            if remaining > self._rotate_at:
                return self._set_token(token)
Example #18
def _is_from_ncbi(gff3_file):
    with open(gff3_file) as in_handle:
        for line in tz.take(10000, in_handle):
            if "Dbxref" in line:
                return "Dbxref"
            if "db_xref" in line:
                return "db_xref"
    return None
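File handles iterate line by line, so tz.take(10000, in_handle) bounds the scan to the first 10,000 lines without reading the whole GFF3 file. A minimal sketch of that bounded scan, assuming a hypothetical path:

import toolz as tz

def head_lines(path, n=5):
    # path is a hypothetical example file; open() yields one line per iteration
    with open(path) as fh:
        return [line.rstrip('\n') for line in tz.take(n, fh)]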
Example #20
 def create_categories(width, plus_one):
     length = int(width / 8) + plus_one
     return [
         ''.join(cs) for cs in take(
             2**width + plus_one,
             product([chr(c) for c in range(256)], repeat=length),
         )
     ]
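take keeps the itertools.product enumeration lazy, so only the first 2**width + plus_one category strings are ever materialized. A smaller sketch of the same idea over a two-letter alphabet:

from itertools import product
from toolz import take

first_five = [''.join(cs) for cs in take(5, product('ab', repeat=3))]
# first_five == ['aaa', 'aab', 'aba', 'abb', 'baa']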
Example #21
 def create_categories(width, plus_one):
     length = int(width / 8) + plus_one
     return [
         ''.join(cs)
         for cs in take(
             2 ** width + plus_one,
             product([chr(c) for c in range(256)], repeat=length),
         )
     ]
Example #22
File: bson.py Project: daskos/epos
def discover_bson(b, n=10, **kwargs):
    with bson_lines(b.path) as lines:
        data = list(take(n, lines))

    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]
    return ds
Example #23
File: test.py Project: keshava/dlsi
def _evaluate_split(split,
                    section_aug,
                    model,
                    device,
                    running_metrics_overall,
                    config,
                    debug=False):
    logger = logging.getLogger(__name__)

    TestSectionLoader = get_test_loader(config)
    test_set = TestSectionLoader(
        data_dir=config.DATASET.ROOT,
        split=split,
        is_transform=True,
        augmentations=section_aug,
    )

    n_classes = test_set.n_classes

    test_loader = data.DataLoader(test_set,
                                  batch_size=1,
                                  num_workers=config.WORKERS,
                                  shuffle=False)
    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(1, test_loader)

    running_metrics_split = runningScore(n_classes)

    # testing mode:
    with torch.no_grad():  # operations inside don't track history
        model.eval()
        total_iteration = 0
        for i, (images, labels) in enumerate(test_loader):
            logger.info(f"split: {split}, section: {i}")
            total_iteration = total_iteration + 1

            outputs = model(images.to(device))

            pred = outputs.detach().max(1)[1].cpu().numpy()
            gt = labels.numpy()
            running_metrics_split.update(gt, pred)
            running_metrics_overall.update(gt, pred)

    # get scores
    score, class_iou = running_metrics_split.get_scores()

    # Log split results
    logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}')
    for cdx, class_name in enumerate(_CLASS_NAMES):
        logger.info(
            f'  {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}')

    logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}')
    logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}')
    logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}')
    running_metrics_split.reset()
Example #24
def speed_test_buffer(buffersize):
    t1 = time.time()
    n_iter = 10_000_000
    gen = read_binary_bus(B1.bus_file, decode_seq=False, buffersize=buffersize)
    gen = toolz.take(n_iter, gen)
    for a in gen:
        pass
    t2 = time.time()
    return t2 - t1
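The benchmark above caps the generator at n_iter records with toolz.take before draining it. A generic sketch of timing the consumption of at most n items, assuming any iterable source:

import time
from toolz import take

def time_first_n(gen, n):
    # drain at most n items and return elapsed seconds
    t0 = time.time()
    for _ in take(n, gen):
        pass
    return time.time() - t0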
Example #25
def fit(cost_f, cost_df, h_theta0, data, eta=0.1, it_max=500, gf='gd'):
    '''
    Compute values of multiple linear regression coefficients
    Parameters
        cost_f: Cost function (J)
        cost_df: gradient of cost function (gradJ for batch and gradJS for stochastic)
        h_theta0: initial guess for fitting parameters (j cols)
        data: list of tuples [(Xi, yi)]
        X: matrix of independent variables (i rows of observations and j cols of variables). x0=1 for all i
        y: dependent variable (i rows)
        eta: learning rate
        it_max: maximum number of iterations
    Returns
        Fitting parameters (j cols)
    '''
    X, y = zip(*data)
    if gf == 'gd':
        f = partial(cost_f, X, y)
        df = partial(cost_df, X, y)
        ans = list(
            take(it_max,
                 ((h_theta, f(h_theta))
                  for h_theta in fgd.gradient_descent(df, h_theta0, eta=eta))))
        value = list(T(ans)[0])
        cost = list(T(ans)[1])
        #t = list(until_within_tol(cost, 1e-7))
        return value[-1], cost
    elif gf == 'sgd':
        df = cost_df
        cost = [sum(cost_f(xi, yi, h_theta0) for xi, yi in data)]
        h_theta = h_theta0
        eta_new = eta
        for _ in xrange(it_max):
            ans = list(
                take(len(y),
                     (e for e in fgd.sgd(df, X, y, h_theta, eta=eta_new))))
            h_theta = ans[-1]
            cost.append(sum(cost_f(xi, yi, h_theta) for xi, yi in data))
            eta_new = 0.99 * eta_new
        return h_theta, cost
    else:
        print('Not a valid function')
        return
Example #26
def collect(grouper, npartitions, group, pbags):
    """ Collect partitions from disk and yield k,v group pairs """
    from pbag import PBag
    pbags = list(take(npartitions, pbags))
    result = defaultdict(list)
    for pb in pbags:
        part = pb.get_partition(group)
        groups = groupby(grouper, part)
        for k, v in groups.items():
            result[k].extend(v)
    return list(result.items())
Example #27
def discover(coll, n=50):
    items = list(take(n, coll.find()))
    for item in items:
        del item['_id']

    ds = discover(items)

    if isdimension(ds[0]):
        return coll.count() * ds.subshape[0]
    else:
        raise ValueError("Consistent datashape not found")
Example #28
File: mongo.py Project: vitan/blaze
def discover(coll, n=50):
    items = list(take(n, coll.find()))
    for item in items:
        del item['_id']

    ds = discover(items)

    if isdimension(ds[0]):
        return coll.count() * ds.subshape[0]
    else:
        raise ValueError("Consistent datashape not found")
Example #29
def plot_lrates(f, df, x0, etas, niter):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    for eta in etas: 
        ax.plot(list(xrange(1, niter + 1)),
                list(take(niter, (f(e) for e in gradient_descent(df, x0, eta=eta)))),
                label=unicode(eta))
    ax.set_xlabel('Iteration Number')
    ax.set_ylabel('f(x)')
    plt.legend(title='Learning Rate')
    plt.show()
    plt.clf()
Example #30
File: core.py Project: esc/dask
def collect(grouper, npartitions, group, pbags):
    """ Collect partitions from disk and yield k,v group pairs """
    from pbag import PBag
    pbags = list(take(npartitions, pbags))
    result = defaultdict(list)
    for pb in pbags:
        part = pb.get_partition(group)
        groups = groupby(grouper, part)
        for k, v in groups.items():
            result[k].extend(v)
    return list(result.items())
Example #31
 def get_selected_indices(self):
     indices = range(self.len_inp * self.out_inp_factor)
     num_extra_elems = self.out_inp_factor * self.len_inp - self.len_out
     selected_groups = set(
         np.random.choice(self.len_inp, num_extra_elems, replace=False))
     selected_indices = list(
         concat(
             take(self.out_inp_factor - 1, group) if i in selected_groups else group
             for i, group in enumerate(
                 partition(self.out_inp_factor, indices))))
     return selected_indices
Example #32
def fit(cost_f, cost_df, h_theta0, data, eta=0.1, it_max=500, gf='gd'):
    '''
    Compute values of multiple linear regression coefficients
    Parameters
        cost_f: Cost function (J)
        cost_df: gradient of cost function (gradJ for batch and gradJS for stochastic)
        h_theta0: initial guess for fitting parameters (j cols)
        data: list of tuples [(Xi, yi)]
        X: matrix of independent variables (i rows of observations and j cols of variables). x0=1 for all i
        y: dependent variable (i rows)
        eta: learning rate
        it_max: maximum number of iterations
    Returns
        Fitting parameters (j cols)
    '''
    X, y = zip(*data)
    if gf == 'gd':
        f = partial(cost_f, X, y)
        df = partial(cost_df, X, y) 
        ans = list(take(it_max, 
                        ((h_theta, f(h_theta)) for h_theta in 
                          fgd.gradient_descent(df, h_theta0, eta=eta))))
        value = list(T(ans)[0])
        cost = list(T(ans)[1])
        #t = list(until_within_tol(cost, 1e-7))
        return value[-1], cost 
    elif gf == 'sgd':
        df = cost_df
        cost = [sum(cost_f(xi, yi, h_theta0) for xi, yi in data)]
        h_theta = h_theta0
        eta_new = eta
        for _ in xrange(it_max):
            ans = list(take(len(y), (e for e in fgd.sgd(df, X, y, h_theta, eta=eta_new))))
            h_theta = ans[-1]
            cost.append(sum(cost_f(xi, yi, h_theta) for xi, yi in data))
            eta_new = 0.99 * eta_new
        return h_theta, cost
    else:
        print('Not a valid function')
        return    
Example #33
def discover_pymongo_collection(coll, n=50):
    items = list(take(n, coll.find()))
    oid_cols = [k for k, v in items[0].items() if isinstance(v, ObjectId)]
    for item in items:
        for col in oid_cols:
            del item[col]

    ds = discover(items)

    if isdimension(ds[0]):
        return coll.count() * ds.subshape[0]
    else:
        raise ValueError("Consistent datashape not found")
Example #34
def plot_lrates(f, df, x0, etas, niter):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    for eta in etas:
        ax.plot(list(xrange(1, niter + 1)),
                list(
                    take(niter,
                         (f(e) for e in gradient_descent(df, x0, eta=eta)))),
                label=unicode(eta))
    ax.set_xlabel('Iteration Number')
    ax.set_ylabel('f(x)')
    plt.legend(title='Learning Rate')
    plt.show()
    plt.clf()
Example #35
def test_cast_string_to_date(alltypes, df, type):
    import toolz

    string_col = alltypes.date_string_col
    month, day, year = toolz.take(3, string_col.split('/'))

    expr = '20' + ibis.literal('-').join([year, month, day])
    expr = expr.cast(type)
    result = expr.execute().astype('datetime64[ns]').sort_values().reset_index(
        drop=True).rename('date_string_col')
    expected = pd.to_datetime(
        df.date_string_col).dt.normalize().sort_values().reset_index(drop=True)
    tm.assert_series_equal(result, expected)
Example #36
    def transform(self, X, y):
        positives = []
        negatives = []
        items = groupby(lambda i: y[i], range(len(y))).items()

        for label, group_ix in items:

            possible_positives = len(group_ix)**2
            if self.num_positive >= possible_positives:
                raise ValueError(
                    'Not enough combinations for positive examples')

            group_ix = np.random.permutation(group_ix)
            positives.extend(
                take(
                    self.num_positive,
                    itertools.product(group_ix,
                                      np.random.permutation(group_ix))))

            other_candidates_ix = np.concatenate([
                group_ix for label_neg, group_ix in items if label_neg != label
            ])

            possible_negatives = len(other_candidates_ix) * len(group_ix)
            if self.num_negative >= possible_negatives:
                raise ValueError(
                    'Not enough combinations for negative examples')

            negatives.extend(
                take(
                    self.num_negative,
                    itertools.product(
                        group_ix, np.random.permutation(other_candidates_ix))))

        all_pairs = positives + negatives
        all_output = [1] * len(positives) + [0] * len(negatives)

        return X.take(all_pairs, axis=0), all_output
Example #37
def _limit_inlines(max_inlines, images_iter):
    if max_inlines is not None:
        images_list = list(images_iter)
        if max_inlines > len(images_list):
            warn_msg = (f"The number of max inlines {max_inlines} is greater "
                        f"than the number of inlines found {len(images_list)}. "
                        f"Setting max inlines to {len(images_list)}")
            warnings.warn(warn_msg)
            max_inlines = len(images_list)
            images_iter = images_list
        else:
            # random.shuffle shuffles in place and returns None
            random.shuffle(images_list)
            images_iter = take(max_inlines, images_list)
    return images_iter, max_inlines
Example #38
def get_dirs_and_files_in_path(path):
    # filter function
    def isdir(a): return os.path.isdir(a)
    # gives the opposite results as above
    not_isdir = toolz.complement(isdir)

    if not path and platform.system() == 'Windows':
        import win32api
        drives = win32api.GetLogicalDriveStrings()
        drives = [d for d in drives.split('\000') if d]
        return drives

    elif os.path.exists(path):
        r = os.listdir(path)
        # 2x access means I have to remove the generator
        f = [os.path.join(path, a) for a in r]
        dirs = filter(isdir, f)
        files = filter(not_isdir, f)

    else:
        try:
            head, tail = os.path.split(path)
            r = os.listdir(head)
            filtered_everything = filter(lambda a: a.startswith(tail), r)
            # because this was accessed twice, I needed to remove the generator
            filtered_everything = [os.path.join(head, a) for a in filtered_everything]
            dirs = filter(isdir, filtered_everything)
            files = filter(not_isdir, filtered_everything)

        except Exception as e:
            print('{0} doesn\'t even exist you stupid'.format(head))
            return None

    result = (sorted(list(toolz.take(100, dirs))),
              sorted(list(toolz.take(100, files))))
    return result
Example #39
def _solve(print=print):
    print("This isn't really done. Groups of 4 work OK, but 5s take forever.")
    print('Coming up with a lazy version of combinations would be better')

    with mp.Pool(4) as pool:
        cs = combinations(take(200, primes()), 4)

        for remarkable, combo in pool.imap_unordered(is_remarkable_2,
                                                     cs,
                                                     chunksize=125):
            if remarkable:
                print(combo)

    return False
Example #40
def test_cast_string_to_date(alltypes, df, type):
    import toolz

    string_col = alltypes.date_string_col
    month, day, year = toolz.take(3, string_col.split("/"))

    expr = ibis.literal("-").join([year, month, day])
    expr = expr.cast(type)

    result = (expr.execute().iloc[:, 0].astype("datetime64[ns]").sort_values().
              reset_index(drop=True).rename("date_string_col"))
    expected = (pd.to_datetime(
        df.date_string_col).dt.normalize().sort_values().reset_index(
            drop=True))
    tm.assert_series_equal(result, expected)
Example #41
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions, variant regions or chromosomes
    """
    callable_files = data.get("callable_regions") or data.get("variant_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop)) for r in pybedtools.BedTool(callable_files[0])]
    else:
        work_bam = list(tz.take(1, filter(lambda x: x.endswith(".bam"), data["work_bams"])))
        if work_bam:
            with contextlib.closing(pysam.Samfile(work_bam[0], "rb")) as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length) in zip(pysam_bam.references,
                                                                         pysam_bam.lengths)]
        else:
            raise NotImplementedError("No variant regions or BAM files to calculate chromosomes")
    return regions
Example #42
def make_frames(frames, width, scale):
    incrementer = itertools.count()
    stencil = RightHandedSimplexStencil(2, 30)
    rotate = np.array([1, -1])
    offset = width / 2 + rotate * width / 10
    points = list(take(frames, stencil.generate_stencil_points()))
    for point in points:
        point.point = rotate * point.point * width / 12 + offset

    def make_frame(t):
        i = next(incrementer)
        surface = gizeh.Surface(width=width, height=width, bg_color=(1, 1, 1))

        line = gizeh.polyline([[offset[0], 0], [offset[0], width]], stroke=grid_color, stroke_width=2)
        line.draw(surface)
        line = gizeh.polyline([[0, offset[1]], [width, offset[1]]], stroke=grid_color, stroke_width=2)
        line.draw(surface)

        x = offset[0] + width/scale
        y = offset[1] - width/scale
        while x <= width + 1:
            line = gizeh.polyline([[x, 0], [x, width]], stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            line = gizeh.polyline([[0, y], [width, y]], stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            x += width/scale
            y -= width/scale
        x = offset[0] - width/scale
        y = offset[1] + width/scale
        while x >= -1:
            line = gizeh.polyline([[x, 0], [x, width]], stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            line = gizeh.polyline([[0, y], [width, y]], stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            x -= width/scale
            y += width/scale

        circle = gizeh.circle(r=3.25, xy=offset, fill=halving_colors[0])
        circle.draw(surface)
        if i > 0:
            for i in range(i-1):
                point = points[i]
                color = halving_colors[point.halvings]
                circle = gizeh.circle(r=max(0.5, 3.25 - 0.75*point.halvings), xy=point.point, fill=color)
                circle.draw(surface)
        return surface.get_npimage()
    return make_frame
Example #43
    def __init__(self, path, mode='rt', schema=None, columns=None, types=None,
            typehints=None, dialect=None, header=None, open=open,
            nrows_discovery=50, chunksize=1024,
            encoding=DEFAULT_ENCODING, **kwargs):
        if 'r' in mode and not os.path.isfile(path):
            raise ValueError('CSV file "%s" does not exist' % path)

        if schema is None and 'w' in mode:
            raise ValueError('Please specify schema for writable CSV file')

        self.path = path
        self.mode = mode
        self.open = {'gz': gzip.open, 'bz2': bz2.BZ2File}.get(ext(path), open)
        self._abspath = os.path.abspath(path)
        self.chunksize = chunksize
        self.encoding = encoding

        sample = get_sample(self)
        self.dialect = dialect = discover_dialect(sample, dialect, **kwargs)

        if header is None:
            header = has_header(sample, encoding=encoding)
        elif isinstance(header, int):
            header = True
        self.header = header

        if not schema and 'w' not in mode:
            schema = discover_csv(path, encoding=encoding, dialect=dialect,
                    header=self.header, typehints=typehints,
                    types=types, columns=columns,
                    nrows_discovery=nrows_discovery)
        self._schema = schema
        self.header = header

        if 'w' not in mode:
            try:
                nd.array(list(take(10, self._iter(chunksize=10))),
                         dtype=str(schema))
            except (ValueError, TypeError) as e:
                raise ValueError("Automatic datashape discovery failed\n"
                        "Discovered the following datashape: %s\n"
                        "But DyND generated the following error: %s\n"
                        "Consider providing type hints using "
                        "typehints={'column-name': 'type'}\n"
                        "like typehints={'start-time': 'string'}"
                        % (schema, e.args[0]))
Example #44
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions, variant regions or chromosomes
    """
    import pybedtools
    callable_files = data.get("callable_regions") or data.get("variant_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop)) for r in pybedtools.BedTool(callable_files[0])]
    else:
        work_bam = list(tz.take(1, filter(lambda x: x.endswith(".bam"), data["work_bams"])))
        if work_bam:
            with pysam.Samfile(work_bam[0], "rb") as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length) in zip(pysam_bam.references,
                                                                         pysam_bam.lengths)]
        else:
            regions = [(r.name, 0, r.size) for r in
                       ref.file_contigs(dd.get_ref_file(data), data["config"])]
    return regions
Example #45
    def get_displacement(n_input, n_tilings):
        """
        Get the displacement vector to use in offsetting the tilings.

        Essentially, we look for numbers less than `n_tilings//2` that are 
        coprime with `n_tilings`. 
        If we can find at least `n_input` of them, we just take the first 
        `n_input`. If there are fewer such viable numbers, we instead cycle
        through the candidates, ensuring we repeat as seldom as possible.

        ..note::
            It's recommended by the CMAC people to just increase the number of 
            tilings when there aren't enough candidate values for the 
            displacement vector.
        """
        viable = [i for i in range(1, n_tilings//2) if gcd(i, n_tilings) == 1]
        ret = list(take(n_input, cycle(viable)))
        return np.array(ret)
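take(n_input, cycle(viable)) pads the displacement vector by repeating the viable offsets when there are fewer candidates than inputs. A small sketch of that cycling behaviour with hypothetical candidates:

from itertools import cycle
from toolz import take

viable = [1, 3]                          # hypothetical coprime candidates
displacement = list(take(5, cycle(viable)))   # [1, 3, 1, 3, 1]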
Example #46
File: core.py Project: jcorbin/dask
def bag_range(n, npartitions):
    """ Numbers from zero to n

    Examples
    --------

    >>> import dask.bag as db
    >>> b = db.range(5, npartitions=2)
    >>> list(b)
    [0, 1, 2, 3, 4]
    """
    size = n // npartitions
    name = 'range-%d-npartitions-%d' % (n, npartitions)
    ijs = list(enumerate(take(npartitions, range(0, n, size))))
    dsk = dict(((name, i), (reify, (range, j, min(j + size, n))))
               for i, j in ijs)

    if n % npartitions != 0:
        i, j = ijs[-1]
        dsk[(name, i)] = (reify, (range, j, n))

    return Bag(dsk, name, npartitions)
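take(npartitions, range(0, n, size)) yields the starting offset of each partition; the final partition is then extended to absorb the remainder when n is not divisible by npartitions. A worked sketch of that arithmetic with n=5 and npartitions=2:

from toolz import take

n, npartitions = 5, 2
size = n // npartitions                                 # 2
starts = list(take(npartitions, range(0, n, size)))     # [0, 2]
parts = [list(range(j, min(j + size, n))) for j in starts]
parts[-1] = list(range(starts[-1], n))                  # last partition takes the remainder
# parts == [[0, 1], [2, 3, 4]]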
Example #47
def roll(request):
    form = forms.LunchGroupForm

    if request.method == 'GET':
        ParticipantFormset = modelformset_factory(models.Participant)
        formset = ParticipantFormset(queryset=models.Participant.objects.filter(is_participating=True))
        context = {'form': form(),
                   'formset': formset}
        return render(request, 'lunch_roulette/base.html', context)

    if request.method == 'POST':
        form = form(request.POST)
        if not form.is_valid():
            context = {'form': form, 'messages': ['Date is not valid']}
            return render(request, 'lunch_roulette/base.html', context)
        date = form.cleaned_data.get('date')

        participants = list(models.Participant.objects.filter(is_participating=True))
        random.shuffle(participants)

        while 1:
            subgroup = list(toolz.take(4, participants))
            participants = participants[4:]
            logging.warn(subgroup)
            if not subgroup:
                break

            group = models.LunchGroup(date=date)
            group.save()

            group.participants.add(*subgroup)
            group.save()


        # import pdb; pdb.set_trace()

        return redirect(roll)
Example #48
def test_cast_string_to_date(alltypes, df, type):
    import toolz

    string_col = alltypes.date_string_col
    month, day, year = toolz.take(3, string_col.split('/'))

    expr = '20' + ibis.literal('-').join([year, month, day])
    expr = expr.cast(type)

    result = (
        expr.execute()
        .astype('datetime64[ns]')
        .sort_values()
        .reset_index(drop=True)
        .rename('date_string_col')
    )
    expected = (
        pd.to_datetime(df.date_string_col)
        .dt.normalize()
        .sort_values()
        .reset_index(drop=True)
    )
    tm.assert_series_equal(result, expected)
Example #49
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)
        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        elif keep_last >= 0:
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(name):
                return name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned
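Since all_runs is sorted oldest-first, set(take(keep_last, reversed(all_runs))) holds the names of the most recent keep_last ingestions, and everything outside that set is cleaned. A minimal sketch with hypothetical run names:

from toolz import take

all_runs = ['2016-01-01', '2016-02-01', '2016-03-01']    # hypothetical, oldest first
keep_last = 2
last_n = set(take(keep_last, reversed(all_runs)))        # {'2016-03-01', '2016-02-01'}
to_clean = [run for run in all_runs if run not in last_n]
# to_clean == ['2016-01-01']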
Example #50
from itertools import repeat
from functools import partial
import types

import pytest
import toolz as tlz
map_c = tlz.curry(tlz.map)
reduce_c = tlz.curry(tlz.reduce)

from smpl_tokenizer import utils


is_generator = lambda obj: isinstance(obj, types.GeneratorType)

var_len_strings = lambda n: list(tlz.take(n, tlz.iterate(lambda string: string + "a", "")))


"""
@pytest.mark.parametrize("test_input,expected", [
        ("3+5", 8),
        ("2+4", 6),
        ("6*9", 42),
])
def _eval(test_input, expected):
        assert _eval(test_input) == expected


@pytest.mark.parametrize("x", [0, 1])
@pytest.mark.parametrize("y", [2, 3])
def test_foo(x, y):
Example #51
def estimate_max_mapq(in_bam, nreads=1e6):
    """Guess maximum MAPQ in a BAM file of reads with alignments
    """
    with pysam.Samfile(in_bam, "rb") as work_bam:
        reads = tz.take(int(nreads), work_bam)
        return max([x.mapq for x in reads if not x.is_unmapped])
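tz.take(nreads, work_bam) bounds the scan to the first nreads alignments, so the max never touches the rest of the BAM. The same bounded-aggregation idea in a generic sketch:

from toolz import take

def max_of_first(n, values):
    # aggregate over at most the first n items of a (possibly huge) stream
    return max(take(n, values))

largest = max_of_first(3, iter([5, 9, 2, 100]))  # 9 -- the 100 is never inspected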
Example #52
def degrade_latin(para,
        om_frac   = 0.1,
        com_frac  = 0.1,
        max_N_om  = 5,
        max_N_com = 5,
        ):
    '''
    'latin' because this tokenizes using str.split

    takes a (str)paragraph
    returns a (str)paragraph'
    with possible degradations (errors):
    - omissions (deletions)
    - commissions (alterations)

    arguments:
    om_frac, com_frac:
        fraction of items to alter,
        where the basis is the number of TOKENS
                        
    max_N_om, max_N_com:
        maximum whole number count of
        tokens to alter
    '''

    buf = para.split()
    ntoken_ = len(buf)

    # a convenience rendered version
    html_rep = copy.copy(buf)
    result = EasyDict(
        omission_index_list = [],
        commission_index_list = [],
    )

    OM_LIMIT = int(math.ceil(om_frac*ntoken_))
    COM_LIMIT = int(math.ceil(com_frac*ntoken_))

    # run omissions first
    ilist = list(range(ntoken_))
    random.shuffle(ilist)
    result.omission_index_list = list(z.take(min(OM_LIMIT, max_N_om), ilist))
    result.omission_index_list.sort()

    for i in reversed(result.omission_index_list):
        del buf[i]
        html_rep[i] = '<span class="deleted">%s</span>'%html_rep[i]

    # THIS HAS CHANGED!
    ntoken_ = len(buf)

    # create new index -> original index mapping
    imapping = dict((i,i) for i in range(ntoken_))
    for i_deleted in result.omission_index_list:
        for i_inc in range(i_deleted, ntoken_):
            imapping[i_inc] += 1

    # then run commissions
    ilist = list(range(ntoken_))
    random.shuffle(ilist)
    com_idx_list = z.take(min(COM_LIMIT, max_N_com), ilist)
    for i in reversed(sorted(com_idx_list)):
        token = buf[i]
        j_degrade = random.randint(0, len(token)-1)
        while True:
            ch = random.choice(string.ascii_lowercase)
            if ch != token[j_degrade]:
                break
        buf[i] = token[:j_degrade]+ch+token[j_degrade+1:]
        original_index = imapping[i]
        #result.commission_index_list.append(original_index)
        html_rep[original_index] = \
                token[:j_degrade] + \
                '<span class="altered">%s</span>'%ch + \
                token[j_degrade+1:]

    result.text = ' '.join(buf)
    result.html_representation = ' '.join(html_rep)
    return result
Example #53
def test_bz2_stream():
    text = '\n'.join(map(str, range(10000)))
    compressed = bz2.compress(text.encode())
    assert (list(take(100, bz2_stream(compressed))) ==
            list(map(lambda x: str(x) + '\n', range(100))))
Example #54
from __future__ import print_function, division, unicode_literals
from toolz import take, compose, pluck
import matplotlib.pyplot as plt
from pylsy2 import pylsytable2
from utility import until_within_tol
from func_gradient_descent import gradient_descent
from out_utils import plot_lrates


def f(x_i):
    return sum(x_ij**2 for x_ij in x_i)
        

def df(x_i):
    return [2 * x_ij for x_ij in x_i]


x0 = [6., 33., 12.2]
tol = 1.e-6
al = [1., 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
niter = 100

plot_lrates(f, df, x0, al, niter)

result = list(take(50, ((f(e), e) for e in gradient_descent(df, x0)) ))
xs = ['x' + unicode(i) for i in xrange(len(x0))]
table = pylsytable2(['y'] + xs)
table.add_data('y', list(pluck(0, result)), '{:.2e}')
for i, x in enumerate(xs):
    table.add_data(x, list(pluck(i,pluck(1, result))), '{:.2e}')
print(table)
Example #55
 def initial(s):
     return list(take(n, s))
Example #56
 def _assert_initial_matches(a, b, n=10):
     assert list(take(n, a)) == list(take(n, b))
Example #57
 def f(_):
     sub = Sub('a')
     return list(toolz.take(5, sub))