Example #1
def compute_rouge_l_sent_level(outputs, reference, mode='f'):
    """ compute sentence-level ROUGE-L for a single summary/reference pair
    outputs and reference are lists of sentences; each sentence is a list of words
    """
    assert mode in list('fpr')  # F-1, precision, recall
    lcs = 0
    word_count = 0
    lcs_r = 0
    sum_count = 0
    for output in outputs:
        lcs += _lcs_len(output, list(concat(reference)))
        word_count += len(output)
    for ref in reference:
        lcs_r += _lcs_len(ref, list(concat(outputs)))  # compare against all output sentences
        sum_count += len(ref)
    if lcs == 0:
        score = 0.0
    else:
        precision = lcs / word_count
        recall = lcs_r / sum_count
        f_score = 2 * (precision * recall) / (precision + recall)
        if mode == 'p':
            score = precision
        elif mode == 'r':
            score = recall
        else:
            score = f_score
    return score
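A minimal standalone sketch (not part of the original module) of the flattening step above: toolz.concat merges the tokenized reference sentences into one flat token list before the LCS is computed; the tokens below are made up.

from toolz import concat

reference = [['the', 'cat', 'sat'], ['on', 'the', 'mat']]
flat_reference = list(concat(reference))
assert flat_reference == ['the', 'cat', 'sat', 'on', 'the', 'mat']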
Example #2
    async def poll_erc20_logs_loop(self):
        while True:
            try:
                new_blocks: List[AttributeDict] = await self._new_blocks_queue.get()

                transfer_tasks = []
                approval_tasks = []
                for address in self._addresses_to_contracts.keys():
                    contract_event_logger: ContractEventLogger = self._contract_event_loggers[address]
                    transfer_tasks.append(
                        contract_event_logger.get_new_entries_from_logs(TRANSFER_EVENT_NAME,
                                                                        new_blocks)
                    )
                    approval_tasks.append(
                        contract_event_logger.get_new_entries_from_logs(APPROVAL_EVENT_NAME,
                                                                        new_blocks)
                    )

                raw_transfer_entries = await safe_gather(*transfer_tasks)
                raw_approval_entries = await safe_gather(*approval_tasks)
                transfer_entries = list(cytoolz.concat(raw_transfer_entries))
                approval_entries = list(cytoolz.concat(raw_approval_entries))
                for transfer_entry in transfer_entries:
                    await self._handle_event_data(transfer_entry)
                for approval_entry in approval_entries:
                    await self._handle_event_data(approval_entry)

            except asyncio.CancelledError:
                raise
            except asyncio.TimeoutError:
                continue
            except Exception:
                self.logger().network("Error fetching new events from ERC20 contracts.", exc_info=True,
                                      app_warning_msg="Error fetching new events from ERC20 contracts. "
                                                      "Check wallet network connection")
Example #3
def a2c_validate(agent, abstractor, loader):
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch in loader:
            ext_sents = []
            ext_inds = []
            for raw_arts in art_batch:
                indices = agent(raw_arts)
                ext_inds += [(len(ext_sents), len(indices)-1)]
                ext_sents += [raw_arts[idx.item()]
                              for idx in indices if idx.item() < len(raw_arts)]
            all_summs = abstractor(ext_sents)
            for (j, n), abs_sents in zip(ext_inds, abs_batch):
                summs = all_summs[j:j+n]
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(list(concat(summs)),
                                              list(concat(abs_sents)), n=1)
                i += 1
    avg_reward /= (i/100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time()-start)), avg_reward))
    return {'reward': avg_reward}
Example #4
def compute_rouge_l_summ(summs, refs, mode='f'):
    """ summary level ROUGE-L"""
    assert mode in list('fpr')  # F-1, precision, recall
    tot_hit = 0
    ref_cnt = Counter(concat(refs))
    summ_cnt = Counter(concat(summs))
    for ref in refs:
        for summ in summs:
            lcs = _lcs(summ, ref)
            for gram in lcs:
                if ref_cnt[gram] > 0 and summ_cnt[gram] > 0:
                    tot_hit += 1
                ref_cnt[gram] -= 1
                summ_cnt[gram] -= 1
    if tot_hit == 0:
        score = 0.0
    else:
        precision = tot_hit / sum((len(s) for s in summs))
        recall = tot_hit / sum((len(r) for r in refs))
        f_score = 2 * (precision * recall) / (precision + recall)
        if mode == 'p':
            score = precision
        elif mode == 'r':
            score = recall
        else:
            score = f_score
    return score
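A short sketch of the counting trick used above, assuming only toolz and the standard library: Counter(concat(...)) tallies every token across all sentences, and the counters are then decremented as LCS hits are consumed; the tokens are invented for the example.

from collections import Counter
from toolz import concat

refs = [['a', 'b'], ['a', 'c']]
ref_cnt = Counter(concat(refs))
assert ref_cnt == Counter({'a': 2, 'b': 1, 'c': 1})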
Example #6
def test_gen():
    from zbox import gen
    c = toolz.concat([[1], [2], [3]])
    g = gen(c)
    assert not isinstance(c, types.GeneratorType)
    assert isinstance(g, types.GeneratorType)
    assert list(g) == list(toolz.concat([[1], [2], [3]]))
Example #7
    async def poll_erc20_logs_loop(self):
        while True:
            try:
                new_blocks: List[AttributeDict] = await self._new_blocks_queue.get()
                block_hashes: List[HexBytes] = [block["hash"] for block in new_blocks]

                transfer_tasks = []
                approval_tasks = []
                for address in self._addresses_to_contracts.keys():
                    contract_event_logger: ContractEventLogger = self._contract_event_loggers[address]
                    transfer_tasks.append(
                        contract_event_logger.get_new_entries_from_logs(TRANSFER_EVENT_NAME,
                                                                        block_hashes)
                    )
                    approval_tasks.append(
                        contract_event_logger.get_new_entries_from_logs(APPROVAL_EVENT_NAME,
                                                                        block_hashes)
                    )

                raw_transfer_entries = await asyncio.gather(*transfer_tasks)
                raw_approval_entries = await asyncio.gather(*approval_tasks)
                transfer_entries = list(cytoolz.concat(raw_transfer_entries))
                approval_entries = list(cytoolz.concat(raw_approval_entries))
                for transfer_entry in transfer_entries:
                    await self._handle_event_data(transfer_entry)
                for approval_entry in approval_entries:
                    await self._handle_event_data(approval_entry)

            except asyncio.CancelledError:
                raise
            except asyncio.TimeoutError:
                continue
            except Exception:
                self.logger().error("Unknown error trying to fetch new events from ERC20 contracts.", exc_info=True)
Example #8
def a2c_validate(agent, abstractor, loader):
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, topic_batch, abs_batch in loader:
            ext_sents = []
            ext_inds = []
            for raw_arts, topic in zip(art_batch, topic_batch):
                indices = agent(raw_arts, topic)
                ext_inds += [(len(ext_sents), len(indices) - 1)]
                ext_sents += [
                    raw_arts[idx.item()] for idx in indices
                    if idx.item() < len(raw_arts)
                ]
            all_summs = abstractor(ext_sents)
            for (j, n), abs_sents in zip(ext_inds, abs_batch):
                summs = all_summs[j:j + n]
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(list(concat(summs)),
                                              list(concat(abs_sents)),
                                              n=1)
                i += 1
    avg_reward /= (i / 100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time() - start)), avg_reward))
    return {'reward': avg_reward}
Example #9
def get_all_leaf_paths(coll):
    """Returns a list of paths to all leaf nodes in a nested dict.

    Paths can travel through lists and the index is inserted into the
    path.
    """
    if isinstance(coll, Mapping):
        return list(
            tz.concat(
                map(
                    lambda t: list(map(lambda p: [t[0]] + p, get_all_leaf_paths(t[1]))),
                    coll.items(),
                )
            )
        )

    elif isinstance(coll, list):
        return list(
            tz.concat(
                map(
                    lambda t: list(map(lambda p: [t[0]] + p, get_all_leaf_paths(t[1]))),
                    enumerate(coll),
                )
            )
        )
    else:
        return [[]]
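An illustrative call (assuming get_all_leaf_paths above and its imports, toolz as tz and collections.abc.Mapping, are in scope); the input dict is made up for the example.

coll = {'a': {'b': 1}, 'c': [2, 3]}
print(get_all_leaf_paths(coll))
# [['a', 'b'], ['c', 0], ['c', 1]]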
Example #10
    def evaluate_subset(self, subset):

        #External call to looping function
        if subset == False:
            l1_cost, l2_match_cost = get_subset(self.candidates, self.costs,
                                                self.matches, self.pointers)

        else:
            l1_cost, l2_match_cost = get_subset(self.candidates[subset],
                                                self.costs[subset],
                                                self.matches[subset],
                                                self.pointers[subset])

        #Find unencoded indexes
        if subset == False:
            unencoded_indexes = list(
                ct.concat([self.indexes[i] for i in range(len(self.indexes))]))
            unencoded_indexes = self.max_index - len(
                list(ct.unique(unencoded_indexes)))

        else:
            unencoded_indexes = list(
                ct.concat([self.indexes[i] for i in subset]))
            unencoded_indexes = self.max_index - len(
                list(ct.unique(unencoded_indexes)))

        #Use unencoded indexes to get regret cost
        #Regret cost applied twice, once for encoding and once for grammar
        if unencoded_indexes > 0:
            if subset == False:
                unencoded_cost = -math.log2(float(1.0 / (unencoded_indexes)))
                l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2

            else:
                unencoded_cost = -math.log2(
                    float(1.0 / (unencoded_indexes + len(subset))))
                l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2

        else:
            l2_regret_cost = 0

        #Total all terms
        total_mdl = l1_cost + l2_match_cost + l2_regret_cost

        #DEBUGGING
        print("\t\tMDL: " + str(total_mdl))
        print("\t\tL1 Cost: " + str(l1_cost))
        print("\t\tL2 Match Cost: " + str(l2_match_cost))
        print("\t\tL2 Regret Cost: " + str(l2_regret_cost))
        print("\t\tEncoded: " + str(self.max_index - unencoded_indexes))
        print("\t\tUnencoded: " + str(unencoded_indexes))

        #Calculate baseline
        if subset == False:
            baseline_cost_per = -math.log2(float(1.0 / self.max_index))
            baseline_mdl = baseline_cost_per * self.max_index
            print("\t\tBaseline: " + str(baseline_mdl))
            print("\t\tRatio: " + str(total_mdl / baseline_mdl))

        return total_mdl
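A standalone sketch of the unencoded-index computation above, using made-up index lists: cytoolz.concat merges the per-candidate coverage lists and cytoolz.unique deduplicates them before counting what remains unencoded.

import cytoolz as ct

indexes = [[0, 1, 2], [2, 3]]     # hypothetical per-candidate coverage
max_index = 6
covered = list(ct.unique(ct.concat(indexes)))
unencoded_indexes = max_index - len(covered)
assert unencoded_indexes == 2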
Example #11
	def evaluate_subset(self, subset):
	
		#External call to looping function
		if subset == False:
			l1_cost, l2_match_cost = get_subset(self.candidates, 
													self.costs, 
													self.matches, 
													self.pointers
													)
													
		else:
			l1_cost, l2_match_cost = get_subset(self.candidates[subset], 
													self.costs[subset], 
													self.matches[subset], 
													self.pointers[subset]
													)
		
		#Find unencoded indexes
		if subset == False:
			unencoded_indexes = list(ct.concat([self.indexes[i] for i in range(len(self.indexes))]))
			unencoded_indexes = self.max_index - len(list(ct.unique(unencoded_indexes)))
		
		else:
			unencoded_indexes = list(ct.concat([self.indexes[i] for i in subset]))
			unencoded_indexes = self.max_index - len(list(ct.unique(unencoded_indexes)))

		#Use unencoded indexes to get regret cost
		#Regret cost applied twice, once for encoding and once for grammar
		if unencoded_indexes > 0:
			if subset == False:
				unencoded_cost = -math.log2(float(1.0/(unencoded_indexes)))
				l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2

			else:
				unencoded_cost = -math.log2(float(1.0/(unencoded_indexes + len(subset))))
				l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2
		
		else:
			l2_regret_cost = 0
		
		#Total all terms
		total_mdl = l1_cost + l2_match_cost + l2_regret_cost
				
		#DEBUGGING
		print("\t\tMDL: " + str(total_mdl))
		print("\t\tL1 Cost: " + str(l1_cost))
		print("\t\tL2 Match Cost: " + str(l2_match_cost))
		print("\t\tL2 Regret Cost: " + str(l2_regret_cost))
		print("\t\tEncoded: " + str(self.max_index - unencoded_indexes))
		print("\t\tUnencoded: " + str(unencoded_indexes))
		
		#Calculate baseline
		if subset == False:
			baseline_cost_per = -math.log2(float(1.0/self.max_index))
			baseline_mdl = baseline_cost_per * self.max_index
			print("\t\tBaseline: " + str(baseline_mdl))
			print("\t\tRatio: " + str(total_mdl/baseline_mdl))		
		
		return total_mdl
Example #12
def coll_fn(data):
    source_lists, target_lists = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    sources = list(filter(bool, concat(source_lists)))
    targets = list(filter(bool, concat(target_lists)))
    assert all(sources) and all(targets)
    return sources, targets
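A minimal sketch of the filtering above (inputs invented for the example): concat flattens the per-document lists of sentences and filter(bool, ...) drops the empty ones.

from toolz import concat

source_lists = [[['w1', 'w2'], []], [['w3']]]
sources = list(filter(bool, concat(source_lists)))
assert sources == [['w1', 'w2'], ['w3']]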
Example #14
def a2c_validate(agent, abstractor, loader):
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch, extract in loader:
            greedy_inputs = []
            for idx, raw_arts in enumerate(art_batch):
                greedy, sample, log_probs = agent(raw_arts,
                                                  sample_time=1,
                                                  validate=True)
                sample = sample[0]
                log_probs = log_probs[0]
                greedy_sents = [raw_arts[ind] for ind in greedy]
                greedy_sents = [word for sent in greedy_sents for word in sent]
                #print(greedy_sents)
                #greedy_sents = list(concat(greedy_sents))
                greedy_sents = []
                ext_sent = []
                for ids in greedy:
                    if ids < len(raw_arts):
                        if ids == 0:
                            if ext_sent:
                                greedy_sents.append(ext_sent)
                            ext_sent = []
                        else:
                            ext_sent += raw_arts[ids]
                if greedy[-1] != 0 and ext_sent:
                    greedy_sents.append(ext_sent)
                #print(greedy_sents)
                #exit()
                greedy_inputs.append(greedy_sents)
            greedy_abstracts = []
            for abs_src in greedy_inputs:
                with torch.no_grad():
                    greedy_outputs = abstractor(abs_src)
                #greedy_abstract = []
                #for greedy_sents in greedy_outputs:
                #    greedy_sents = sent_tokenize(' '.join(greedy_sents))
                #    greedy_sents = [sent.strip().split(' ') for sent in greedy_sents]
                #    greedy_abstract += greedy_sents
                greedy_abstract = list(concat(greedy_outputs))
                greedy_abstracts.append(greedy_abstract)
            for idx, greedy_sents in enumerate(greedy_abstracts):
                abss = abs_batch[idx]
                bs = compute_rouge_n(greedy_sents, list(concat(abss)))
                avg_reward += bs
                i += 1
                #print(i)
                #print(avg_reward)
                #exit()
    avg_reward /= (i / 100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time() - start)), avg_reward))
    return {'reward': avg_reward}
Example #15
def a2c_validate(agent, abstractor, loader):
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch, sent_batch in loader:
            print(i)
            ext_sents = []
            ext_inds = []
            masks = []
            dirty = []
            for raw_arts, sent_labels in zip(art_batch, sent_batch):
                indices = agent(raw_arts, sent_labels)
                ext_inds += [(len(ext_sents), len(indices) - 1)]
                assert indices[-1][-1].item() == len(raw_arts) + 1
                tmp_stop = indices[-1][-1].item()
                tmp_truncate = tmp_stop - 1
                str_arts = list(map(lambda x: ' '.join(x), raw_arts))
                for idx in indices:
                    t, m = rl_edu_to_sentence(str_arts, idx)
                    if t == []:
                        assert len(idx) == 1
                        id = idx[0].item()
                        if id == tmp_truncate:
                            dirty.append(len(ext_sents))
                            ext_sents.append(label)
                            masks.append(label_mask)
                    else:
                        if idx[-1].item() != tmp_stop:
                            ext_sents.append(t)
                            masks.append(m)
            all_summs = abstractor(ext_sents, masks)
            for d in dirty:
                all_summs[d] = []
            for (j, n), abs_sents in zip(ext_inds, abs_batch):
                summs = all_summs[j:j + n]
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(list(concat(summs)),
                                              list(concat(abs_sents)),
                                              n=1)
                i += 1
                if i % 100 == 1:
                    print(avg_reward / i, i)
                '''
                with open('./compare/rl/' + str(i - 1) + '.dec', 'w') as f:
                    for s in summs:
                        s = ' '.join(s)
                        f.write(s + '\n')
                '''
            #if i > 1000:
            #    break
    avg_reward /= (i / 100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time() - start)), avg_reward))
    return {'reward': avg_reward}
Example #16
def batchify_fn(pad, data, cuda=True):
    source_lists, targets = tuple(map(list, unzip(data)))

    sources = pad_batch_tensorize(inputs=list(concat(source_lists)), pad=pad, cuda=cuda)
    tensor_type = torch.cuda.LongTensor if cuda else torch.LongTensor
    target = tensor_type(list(concat(targets)))

    fw_args = (sources,)
    loss_args = (target,)
    return fw_args, loss_args
Example #17
def summarize_text(doc: Doc):
    '''
    Reduces a large doc to a few sentences at the beginning, middle, and end of the document.
    '''
    sentences = [sent.text for sent in doc.sents]
    # join whole sentence strings; concat over strings would yield individual characters
    doc_start = " ".join(sentences[:2])
    mid_i = int(len(sentences) / 2)
    doc_mid = " ".join(sentences[mid_i:mid_i + 2])
    doc_end = " ".join(sentences[-2:])

    return f'{doc_start}\n...\n{doc_mid}\n...\n{doc_end}'
Example #18
    async def _download_receipts(self,
                                 target_td: int,
                                 all_headers: Tuple[BlockHeader, ...]) -> None:
        """
        Downloads and persists the receipts for the given set of block headers.
        Receipts are requested from all peers in equal sized batches.
        """
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        headers = tuple(unique(
            (header for header in all_headers if not _is_receipts_empty(header)),
            key=operator.attrgetter('receipt_root'),
        ))

        while headers:
            # split the remaining headers into equal sized batches for each peer.
            peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
            if not peers:
                raise NoEligiblePeers(
                    "No connected peers have the receipts we need for td={0}".format(target_td)
                )
            batch_size = math.ceil(len(headers) / len(peers))
            batches = tuple(partition_all(batch_size, headers))

            # issue requests to all of the peers and wait for all of them to respond.
            requests = tuple(
                self._get_receipts(peer, batch)
                for peer, batch
                in zip(peers, batches)
            )
            responses = await self.wait(asyncio.gather(
                *requests,
                loop=self.get_event_loop(),
            ))

            # extract the returned receipt data and the headers for which we
            # are still missing receipts.
            all_receipt_bundles, all_missing_headers = zip(*responses)
            receipt_bundles = tuple(concat(all_receipt_bundles))
            headers = tuple(concat(all_missing_headers))

            if len(receipt_bundles) == 0:
                continue

            # process all of the returned receipts, storing their trie data
            # dicts in the database
            receipts, trie_roots_and_data_dicts = zip(*receipt_bundles)
            trie_roots, trie_data_dicts = zip(*trie_roots_and_data_dicts)
            for trie_data in trie_data_dicts:
                await self.wait(self.db.coro_persist_trie_data_dict(trie_data))

        self.logger.debug("Got receipts batch for %d headers", len(all_headers))
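A toy sketch of the response handling above (strings stand in for receipt bundles and headers): zip(*responses) separates the per-peer (bundles, missing_headers) pairs, and concat flattens each side.

from toolz import concat

responses = [(['r1', 'r2'], []), (['r3'], ['h4'])]
all_receipt_bundles, all_missing_headers = zip(*responses)
assert tuple(concat(all_receipt_bundles)) == ('r1', 'r2', 'r3')
assert tuple(concat(all_missing_headers)) == ('h4',)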
Example #19
def a2c_validate(agent, abstractor, loader):
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch, ext_batch in loader:
            ext_sents = []
            ext_inds = []
            sent_acts = []
            for raw_arts in art_batch:
                (indices, _), actions = agent(raw_arts)
                ext_inds += [(len(ext_sents), len(indices) - 1)]
                ext_sents += [
                    raw_arts[idx.item()] for idx in indices
                    if idx.item() < len(raw_arts)
                ]

                sent_acts += [
                    actions[j] for j, idx in enumerate(indices)
                    if idx.item() < len(raw_arts)
                ]

            assert len(ext_sents) == len(sent_acts)

            all_summs = []
            need_abs_sents = [
                ext_sents[iters] for iters, act in enumerate(sent_acts)
                if act == 0
            ]
            if len(need_abs_sents) > 0:
                turn_abs_sents = abstractor(need_abs_sents)

            for nums, action in enumerate(sent_acts):
                if action == 0:
                    all_summs += turn_abs_sents.pop(0)
                else:
                    all_summs += ext_sents[nums]

            for (j, n), abs_sents in zip(ext_inds, abs_batch):
                summs = all_summs[j:j + n]
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(list(concat(summs)),
                                              list(concat(abs_sents)),
                                              n=1)
                i += 1
    avg_reward /= (i / 100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time() - start)), avg_reward))
    return {'reward': avg_reward}
Example #20
    async def _download_block_bodies(
        self, target_td: int, all_headers: Tuple[BlockHeader, ...]
    ) -> Dict[Tuple[Hash32, Hash32], BlockBody]:
        """
        Downloads and persists the block bodies for the given set of block headers.
        Block bodies are requested from all peers in equal sized batches.
        """
        headers = tuple(header for header in all_headers
                        if not _is_body_empty(header))
        block_bodies_by_key: Dict[Tuple[Hash32, Hash32], BlockBody] = {}

        while headers:
            # split the remaining headers into equal sized batches for each peer.
            peers = cast(Tuple[ETHPeer, ...],
                         self.peer_pool.get_peers(target_td))
            if not peers:
                raise NoEligiblePeers(
                    "No connected peers have the block bodies we need for td={0}"
                    .format(target_td))
            batch_size = math.ceil(len(headers) / len(peers))
            batches = tuple(partition_all(batch_size, headers))

            # issue requests to all of the peers and wait for all of them to respond.
            requests = tuple(
                self._get_block_bodies(peer, batch)
                for peer, batch in zip(peers, batches))
            responses = await self.wait(
                asyncio.gather(
                    *requests,
                    loop=self.get_event_loop(),
                ))

            # extract the returned block body data and the headers for which we
            # are still missing block bodies.
            all_block_body_bundles, all_missing_headers = zip(*responses)

            for (body, (tx_root, trie_data_dict),
                 uncles_hash) in concat(all_block_body_bundles):
                await self.wait(
                    self.db.coro_persist_trie_data_dict(trie_data_dict))

            block_bodies_by_key = merge(
                block_bodies_by_key,
                {(transaction_root, uncles_hash): block_body
                 for block_body, (transaction_root, trie_dict_data),
                 uncles_hash in concat(all_block_body_bundles)})
            headers = tuple(concat(all_missing_headers))

        self.logger.debug("Got block bodies batch for %d headers",
                          len(all_headers))
        return block_bodies_by_key
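A standalone sketch of the batching step shared by both downloaders above: partition_all splits the remaining headers into equal-sized batches (the last one may be shorter), and concat would reassemble them; the headers here are just integers.

from toolz import concat, partition_all

headers = list(range(7))
batches = tuple(partition_all(3, headers))
assert batches == ((0, 1, 2), (3, 4, 5), (6,))
assert tuple(concat(batches)) == tuple(headers)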
Example #21
def test_mapped():
    _cfg = cfg.get('chipmunk-ard', env=test.env)

    chipmap = chips.mapped(x=test.x,
                           y=test.y,
                           acquired=test.acquired,
                           specmap=specs.mapped(
                               ubids=cfg.ubids.get('chipmunk-ard'),
                               specs=_cfg.get('registry_fn')()),
                           chips_fn=_cfg.get('chips_fn'))

    assert len(chipmap) > 0
    assert all(map(lambda x: type(x) is dict, concat(chipmap.values())))
    assert len(list(concat(chipmap.values()))) > 0
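A tiny illustration (with made-up chip dicts) of the assertions above: concat(chipmap.values()) walks every chip from every key in the mapping.

from toolz import concat

chipmap = {'red': [{'x': 0}], 'blue': [{'x': 1}, {'x': 2}]}
assert all(isinstance(c, dict) for c in concat(chipmap.values()))
assert len(list(concat(chipmap.values()))) == 3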
Example #22
def prepro_rl_graph(tokenized_sents,
                    nodes,
                    edges,
                    paras,
                    subgraphs,
                    adj_type='edge_as_node',
                    docgraph=True):
    max_len = len(list(concat(tokenized_sents)))
    _, word_inpara_freq_feat, _, sent_inpara_freq_feat = create_word_freq_in_para_feat(
        paras, tokenized_sents, list(concat(tokenized_sents)))
    if docgraph:
        nodewords, nodelength, nodefreq, sum_worthy, triples, relations, sent_node_aligns = process_nodes(
            nodes,
            edges,
            max_len,
            max_sent_num=len(list(tokenized_sents)),
            key='InSalientSent',
            adj_type=adj_type)
        nodes = (nodewords, nodefreq, word_inpara_freq_feat,
                 sent_inpara_freq_feat, triples, relations, sent_node_aligns)
    else:
        nodewords, node_lists, nodefreq, sum_worthy, triples, relations = process_subgraphs(
            nodes,
            edges,
            subgraphs,
            paras,
            max_len,
            max_sent=len(list(tokenized_sents)),
            key='InSalientSent',
            adj_type=adj_type)
        sent_align_para = []
        last_idx = 0
        for sent in range(len(tokenized_sents)):
            flag = False
            for _idx, para in enumerate(paras):
                if sent in para:
                    sent_align_para.append([_idx])
                    last_idx = _idx
                    flag = True
                    break
            if not flag:
                sent_align_para.append([last_idx])
        assert len(sent_align_para) == len(tokenized_sents)
        sent_align_para.append([last_idx + 1])
        nodes = (nodewords, nodefreq, word_inpara_freq_feat,
                 sent_inpara_freq_feat, triples, relations, sent_align_para,
                 node_lists)

    return nodes
Example #23
def process_file(file, language, workers = 64):

	start = time.time()
	
	while True:
		try:
			df = pd.read_csv(file)
			break
		except Exception as e:
			print(e)
			time.sleep(10)
			
	pages = df.loc[:,"Text"].values
	del df
	pages = [str(x).split("\n") for x in pages]
	pages = list(ct.concat(pages))

	#Multi-process
	pool_instance = mp.Pool(processes = workers, maxtasksperchild = None)
	codes = pool_instance.map(get_lid, pages, chunksize = 100)
	pool_instance.close()
	pool_instance.join()
	
	pages = [pages[i] for i in range(len(pages)) if codes[i][0] == language and codes[i][1] == language]
	print("\t" + file + "  " + str(time.time() - start) + "  with  " + str(len(pages)))
	
	return pages
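A minimal sketch of the page flattening above: each page string is split into lines and cytoolz.concat merges the per-page line lists into one flat list; the page texts are invented.

import cytoolz as ct

pages = ["line one\nline two", "line three"]
pages = [str(x).split("\n") for x in pages]
pages = list(ct.concat(pages))
assert pages == ["line one", "line two", "line three"]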
Example #24
	def run_pearson_prune(self, vector_array, class_array, significance_level, cor_level):

		time_start = time.time()
		
		print("Multi-processing Pearson's R feature pruning:")
		
		#Multi-process Pearson pruning#
		pool_instance=mp.Pool(processes = self.workers, maxtasksperchild = 1)
		remove_list = pool_instance.map(partial(self.process_pearson_prune, 
													vector_array = vector_array, 
													significance_level = significance_level,
													cor_level = cor_level,
													max = vector_array.shape[1]
													), [i for i in range(0, vector_array.shape[1])], chunksize = 1)
		pool_instance.close()
		pool_instance.join()
		
		remove_list = list(ct.concat(remove_list))
		remove_list = list(set(remove_list))

		vector_array = vector_array[:, [x for x in range(0, vector_array.shape[1]) if x not in remove_list]]	
		
		print("")
		print("Features above correlation threshold (" + str(cor_level) + "): " + str(len(remove_list)))
		print("Time for completion: " + str((float(time.time())) - time_start))
		print("Finished with Pearson R Feature Pruning.")
		print("")
		
		return vector_array
Example #25
def vsm_collate(inputs):
    (video_inputs, vids, sub_queries_and_targets) = map(list, unzip(inputs))
    (input_ids, attn_masks, sub_vids, targets) = map(
        list, unzip(concat(outs for outs in sub_queries_and_targets)))

    batch = video_collate(video_inputs)
    vid2idx = {vid: i for i, vid in enumerate(vids)}
    batch["q_vidx"] = torch.tensor([vid2idx[s_vid] for s_vid in sub_vids],
                                   dtype=torch.long)

    # text batches
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(0, input_ids.size(1), dtype=torch.long
                                ).unsqueeze(0)
    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)

    vsm_targets = pad_sequence(
        targets, batch_first=True, padding_value=-1)
    batch.update({
        'query_input_ids': input_ids,
        'query_pos_ids': position_ids,
        'query_attn_masks': attn_masks,
        'targets': vsm_targets,
        'vids': vids})

    return batch
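A simplified sketch of the collate pattern above, with strings and ints standing in for tensors: each batch element contributes a list of (input_ids, target) pairs, concat merges those lists, and unzip splits the pairs into parallel sequences.

from toolz import concat
from toolz.sandbox import unzip

inputs = [[('ids1', 0), ('ids2', 1)], [('ids3', 2)]]
input_ids, targets = map(list, unzip(concat(inputs)))
assert input_ids == ['ids1', 'ids2', 'ids3']
assert targets == [0, 1, 2]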
Example #26
 def tags(beamline, runs):
     hi_tags = fromiter(map(read_hightagnumber(beamline), runs), 'int')
     if not hi_tags.all():
         raise ValueError('Not all the runs have a single high tag!')
     hi_tag = hi_tags[0]
     low_tags = concat(map(read_taglist_byrun(beamline), runs))
     return hi_tag, low_tags
Example #27
def recmerge(*objs, merge_sequences=False):
    """Recursively merge an arbitrary number of collections. For conflicting
    values, later collections to the right are given priority. By default
    (merge_sequences=False), sequences are treated as a normal value and not
    merged.

    Args:
        *objs: collections to merge
        merge_sequences: whether to merge values that are sequences

    Returns: merged collection
    """
    if isinstance(objs, tuple) and len(objs) == 1:
        # A squeeze operation since merge_with generates tuple(list_of_objs,)
        objs = objs[0]
    if all([isinstance(obj, Mapping) for obj in objs]):
        # Merges all the collections, recursively applies merging to the combined values
        return tz.merge_with(
            partial(recmerge, merge_sequences=merge_sequences), *objs)
    elif all([isinstance(obj, Sequence) for obj in objs]) and merge_sequences:
        # Merges sequence values by concatenation
        return list(tz.concat(objs))
    else:
        # If colls does not contain mappings, simply pick the last one
        return tz.last(objs)
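An illustrative call (assuming recmerge above and its imports are in scope; the dicts are made up): later arguments win on conflicts, and merge_sequences controls whether list values are concatenated or replaced.

a = {'x': {'y': 1, 'z': [1]}, 'w': 2}
b = {'x': {'y': 10, 'z': [2]}}
print(recmerge(a, b))                        # {'x': {'y': 10, 'z': [2]}, 'w': 2}
print(recmerge(a, b, merge_sequences=True))  # {'x': {'y': 10, 'z': [1, 2]}, 'w': 2}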
Example #28
        def prepro(tokenizer, d, max_len=512):
            """ make sure data is not empty"""
            source_sents, extracts = d
            tokenized_sents = [
                tokenizer.tokenize(source_sent.lower())
                for source_sent in source_sents
            ]
            tokenized_sents = [
                tokenized_sent + ['[SEP]']
                for tokenized_sent in tokenized_sents
            ]
            tokenized_sents[0] = ['[CLS]'] + tokenized_sents[0]
            word_num = [
                len(tokenized_sent) for tokenized_sent in tokenized_sents
            ]
            truncated_word_num = []
            total_count = 0
            for num in word_num:
                if total_count + num < max_len:
                    truncated_word_num.append(num)
                else:
                    truncated_word_num.append(max_len - total_count)
                    break
                total_count += num
            tokenized_sents = list(concat(tokenized_sents))[:max_len]
            tokenized_sents = tokenizer.convert_tokens_to_ids(tokenized_sents)
            abs_sents = tokenize(None, extracts)
            art_sents = tokenize(None, source_sents)

            return (art_sents, tokenized_sents, truncated_word_num), abs_sents
Example #29
    async def check_incoming_eth(self, new_blocks: List[AttributeDict]):
        watch_addresses: Set[str] = self._watch_addresses
        filtered_blocks: List[AttributeDict] = [block for block in new_blocks if block is not None]
        block_to_timestamp: Dict[str, float] = dict((block.hash, float(block.timestamp))
                                                    for block in filtered_blocks)
        transactions: List[AttributeDict] = list(cytoolz.concat(b.transactions for b in filtered_blocks))
        incoming_eth_transactions: List[AttributeDict] = [t for t in transactions
                                                          if ((t.get("to") in watch_addresses) and
                                                              (t.get("value", 0) > 0))]

        for incoming_transaction in incoming_eth_transactions:
            # Filter out failed transactions.
            receipt: AttributeDict = self._w3.eth.getTransactionReceipt(incoming_transaction.hash)
            if receipt.status != 1:
                continue

            # Emit event.
            raw_eth_value: int = incoming_transaction.get("value")
            eth_value: float = raw_eth_value * 1e-18
            from_address: str = incoming_transaction.get("from")
            to_address: str = incoming_transaction.get("to")
            timestamp: float = block_to_timestamp[incoming_transaction.get("blockHash")]
            self.trigger_event(IncomingEthWatcherEvent.ReceivedEther,
                               WalletReceivedAssetEvent(timestamp, incoming_transaction.hash.hex(),
                                                        from_address, to_address, "ETH", eth_value, raw_eth_value))
Example #30
def itm_rank_collate(inputs):
    (
        input_ids,
        img_feats,
        img_pos_feats,
        attn_masks,
    ) = map(list, unzip(concat(i for i in inputs)))

    txt_lens = [i.size(0) for i in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(0, input_ids.size(1),
                                dtype=torch.long).unsqueeze(0)

    num_bbs = [f.size(0) for f in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    sample_size = len(inputs[0])
    assert all(sample_size == len(i) for i in inputs)

    bs, max_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    batch = {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'sample_size': sample_size
    }
    return batch
Example #31
File: tvc.py Project: zhixinma/HERO
    def collate(inputs):
        (video_inputs, all_clip_ranges, attn_masks_list,
         metas) = map(list, unzip(inputs))

        all_attn_masks = list(concat(attn_masks_list))
        attn_mask = pad_sequence(all_attn_masks,
                                 batch_first=True,
                                 padding_value=0)
        batch = {
            'cap_attn_mask': attn_mask,
            'clip_ranges': tuple(map(tuple, all_clip_ranges))
        }

        vid_batch = video_collate(video_inputs)
        batch.update(vid_batch)

        # meta
        vids, clip_ids, all_ts = [], [], []
        for vid, cids, tss in metas:
            for cid, ts in zip(cids, tss):
                vids.append(vid)
                clip_ids.append(int(cid))
                all_ts.append(ts)
        batch['vid_names'] = vids
        batch['clip_ids'] = clip_ids
        batch['all_ts'] = all_ts
        return batch
Example #32
def vcr_collate(inputs):
    (input_ids, txt_type_ids, img_feats, img_pos_feats, attn_masks,
     targets) = map(list, unzip(concat(inputs)))

    txt_lens = [i.size(0) for i in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    txt_type_ids = pad_sequence(txt_type_ids,
                                batch_first=True,
                                padding_value=0)
    position_ids = torch.arange(0, input_ids.size(1),
                                dtype=torch.long).unsqueeze(0)

    # image batches
    num_bbs = [f.size(0) for f in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    targets = torch.stack(targets, dim=0)

    bs, max_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    batch = {
        'input_ids': input_ids,
        'txt_type_ids': txt_type_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'targets': targets
    }
    return batch
Example #33
def async_requests(
    url_payload: List[Tuple[str, Optional[MutableMapping[str, Any]]]],
    read: str,
    request: str = "GET",
    max_workers: int = 8,
) -> List[Union[str, MutableMapping[str, Any], bytes]]:
    """Send async requests.

    This function is based on
    `this <https://github.com/HydrologicEngineeringCenter/data-retrieval-scripts/blob/master/qpe_async_download.py>`__
    script.

    Parameters
    ----------
    url_payload : list of tuples
        A list of URLs and payloads as a tuple.
    read : str
        The method for returning the request; binary, json, and text.
    request : str, optional
        The request type; GET or POST, defaults to GET.
    max_workers : int, optional
        The maximum number of async processes, defaults to 8.

    Returns
    -------
    list
        A list of responses
    """
    chunked_urls = tlz.partition_all(max_workers, url_payload)

    results = (asyncio.get_event_loop().run_until_complete(
        _async_session(c, read, request)) for c in chunked_urls)
    return list(tlz.concat(results))
Example #34
File: tvc.py Project: zhixinma/HERO
    def collate(inputs):
        video_inputs, all_clip_ranges, cap_inputs = map(list, unzip(inputs))

        (all_input_ids, all_tgt_ids,
         all_attn_masks) = map(list,
                               unzip(concat(outs for outs in cap_inputs)))
        input_ids = pad_sequence(all_input_ids,
                                 batch_first=True,
                                 padding_value=1)
        position_ids = torch.arange(0, input_ids.size(1),
                                    dtype=torch.long).unsqueeze(0)
        tgt_ids = pad_sequence(all_tgt_ids, batch_first=True, padding_value=-1)
        attn_mask = pad_sequence(all_attn_masks,
                                 batch_first=True,
                                 padding_value=0)
        batch = {
            'cap_input_ids': input_ids,
            'cap_pos_ids': position_ids,
            'cap_tgt_ids': tgt_ids,
            'cap_attn_mask': attn_mask,
            'clip_ranges': tuple(map(tuple, all_clip_ranges))
        }

        vid_batch = video_collate(video_inputs)
        batch.update(vid_batch)
        return batch
Example #35
def compute_up(t, seq, **kwargs):
    try:
        row = first(seq)
    except StopIteration:
        return ()
    seq = concat([[row], seq]) # re-add row to seq

    if isinstance(row, list):
        seq = map(tuple, seq)

    return unique(seq)
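A standalone sketch of the peek-and-reattach idiom above: first consumes one element to inspect it, and concat([[row], seq]) chains it back onto the front of the remaining iterator.

from toolz import concat, first

seq = iter([10, 20, 30])
row = first(seq)               # consumes the first element
seq = concat([[row], seq])     # re-add row to seq
assert list(seq) == [10, 20, 30]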
Example #36
def next_search_beam(beam, beam_size, finished,
                     end, topk, lp, hists, attn=None, diverse=1.0):
    """generate the next beam(K-best hyps)"""
    topks, lps, hists_list, attns = _unpack_topk(topk, lp, hists, attn)
    hyps_lists = [h.extend_k(topks[i], lps[i],
                             hists_list[i], attns[i], diverse)
                  for i, h in enumerate(beam)]
    hyps = list(concat(hyps_lists))
    finished, beam = _clean_beam(finished, hyps, end, beam_size)

    return finished, beam
Example #37
def batchify_fn_extract_ff(pad, data, cuda=True):
    source_lists, targets = tuple(map(list, unzip(data)))

    src_nums = list(map(len, source_lists))
    sources = list(map(pad_batch_tensorize(pad=pad, cuda=cuda), source_lists))

    tensor_type = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    target = tensor_type(list(concat(targets)))

    fw_args = (sources, src_nums)
    loss_args = (target, )
    return fw_args, loss_args
Example #38
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)),
                         consolidate=None):
    """ Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat([zip(inds, dims) for (x, inds), (x, dims)
                     in toolz.join(toolz.first, argpairs2, toolz.first, numblocks.items())])

    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict((k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
Example #39
def compute_up(t, seq, **kwargs):
    if t.on:
        raise NotImplementedError("python backend cannot specify what columns to distinct on")
    try:
        row = toolz.first(seq)
    except StopIteration:
        return ()
    seq = concat([[row], seq])  # re-add row to seq

    if isinstance(row, list):
        seq = map(tuple, seq)

    return unique(seq)
Example #40
def pre_compute(expr, seq):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        return pluck(expr.fields, seq)
    else:
        return seq
Example #41
def pre_compute(expr, seq, scope=None, **kwargs):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        leaf = expr._leaves()[0]
        return pluck(leaf.fields, seq)
    else:
        return seq
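A minimal sketch of the dict branch above (records invented for the example): after peeking at the first row and rechaining it with concat, pluck projects the requested fields from every row.

from toolz import concat, pluck

seq = iter([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
first_row = next(seq)
seq = concat([[first_row], seq])
assert list(pluck(['a'], seq)) == [(1,), (3,)]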
Example #42
 def _dict(self):
     if hasattr(self, '_cached_dict'):
         return self._cached_dict
     else:
         keys = tuple(map(blockwise_token, range(len(self.indices))))
         func = SubgraphCallable(self.dsk, self.output, keys)
         self._cached_dict = make_blockwise_graph(
             func,
             self.output,
             self.output_indices,
             *list(toolz.concat(self.indices)),
             new_axes=self.new_axes,
             numblocks=self.numblocks,
             concatenate=self.concatenate
         )
     return self._cached_dict
Example #43
File: core.py Project: OspreyX/dask
def isempty(seq):
    """ Is the sequence empty?

    >>> seq = iter([1, 2, 3])
    >>> empty, seq = isempty(seq)
    >>> empty
    False

    >>> list(seq)  # seq is preserved
    [1, 2, 3]

    >>> seq = iter([])
    >>> empty, seq = isempty(seq)
    >>> empty
    True
    """
    try:
        first = next(seq)
        return False, concat([[first], seq])
    except StopIteration:
        return True, False
Example #44
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop('dask_key_name', None)
    pure = kwargs.pop('pure', pure)

    if dask_key_name is None:
        name = '%s-%s' % (funcname(func),
                          tokenize(func_token, *args, pure=pure, **kwargs))
    else:
        name = dask_key_name

    args2, collections = unzip(map(unpack_collections, args), 2)
    collections = list(concat(collections))

    if kwargs:
        dask_kwargs, collections2 = unpack_collections(kwargs)
        collections.extend(collections2)
        task = (apply, func, list(args2), dask_kwargs)
    else:
        task = (func,) + args2

    graph = HighLevelGraph.from_collections(name, {name: task},
                                            dependencies=collections)
    nout = nout if nout is not None else None
    return Delayed(name, graph, length=nout)
Example #45
def ordered_intersect(*sets):
    """Set intersection of two sequences that preserves order.

    Parameters
    ----------
    sets : tuple of Sequence

    Returns
    -------
    generator

    Examples
    --------
    >>> list(ordered_intersect('abcd', 'cdef'))
    ['c', 'd']
    >>> list(ordered_intersect('bcda', 'bdfga'))
    ['b', 'd', 'a']
    >>> list(ordered_intersect('zega', 'age'))  # 1st sequence determines order
    ['e', 'g', 'a']
    >>> list(ordered_intersect('gah', 'bag', 'carge'))
    ['g', 'a']
    """
    common = frozenset.intersection(*map(frozenset, sets))
    return (x for x in unique(concat(sets)) if x in common)
Example #46
            # instead of modifying/using global state, choosing to pass in
            # the updated request as a param means that the handler functions
            # are all pure functions of their input params.
            #
            # This should make testing them easier - it's one less thing to mock.
            return req_fun(t.merge(opts, r), *args, **kwargs)

        return requirejson_wrapper
    return reqjson

# converts a dictionary to flat list of key/value pairs.
# each key can have multiple values and they will all be unpacked accordingly.
multipairs=lambda d: list(t.concat(t.map(
                          lambda i: (lambda k,v: t.concat((k,e) for e in v)
                                                if isinstance(v,list)
                                                else (k,v))(i[0],i[1]),
                      d.items())))

# --------------------------------------------------------------------------
#                                      REST API
# --------------------------------------------------------------------------
@app.get('/')
def default(message=''):
  return template('signin', message=message)

@app.post('/signin')
@params(keys=['barcode'])
def signin(p):
  u = filter(lambda v: v.id == p['barcode'], data['users'].values())
  if len(u) > 0:
Example #47
def rerank_mp(all_beams, ext_inds):
    beam_lists = [all_beams[i: i+n] for i, n in ext_inds if n > 0]
    with mp.Pool(8) as pool:
        reranked = pool.map(rerank_one, beam_lists)
    return list(concat(reranked))
Example #48
def rerank(all_beams, ext_inds):
    beam_lists = (all_beams[i: i+n] for i, n in ext_inds if n > 0)
    return list(concat(map(rerank_one, beam_lists)))
Example #50
	def recursive_beam(self, previous_start, line, i, line_length):

		go = False
		
		if len(previous_start) < 2:
			go = True
			
		if self.search_monitor.count(previous_start[0:2]) < 40:
			go = True
			
		if go == True:
			self.search_monitor.append(previous_start[0:2])
			#Progress down the line
			i += 1

			#Stop at the end
			if i < line_length:
				
				#For each available next path
				for start in [(1, line[i][0]), (2, line[i][1]), (3, line[i][2])]:
					
					#Create larger path
					try:
						previous_start = list(ct.concat(previous_start))

					except:
						previous_start = previous_start
						
					current_path = list(ct.concat([previous_start, start]))
					current_path = tuple(ct.partition(2, current_path))
					
					if len(current_path) > 2:
						test_path = current_path[-2:]
						current_dict = self.association_dict[test_path]
							
						if current_dict != {}:
									
							delta_p = max(current_dict["LR"], current_dict["RL"])
								
							if delta_p > self.delta_threshold:
								self.recursive_beam(current_path, line, i, line_length)
															
							#This is the end of a candidate sequence
							else:
								#Has to be at least 3 slots
								if len(current_path) > 3:
										
									#Remove the bad part
									current_path = current_path[0:-1]
									
									#Add to candidate_stack
									self.candidate_stack[i - len(current_path) + 1].append(current_path)

					else:
						current_dict = self.association_dict[current_path]

						if current_dict != {}:
							delta_p = max(current_dict["LR"], current_dict["RL"])
								
							if delta_p > self.delta_threshold:
								self.recursive_beam(current_path, line, i, line_length)
								
			return
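A small sketch of the path bookkeeping above (with a made-up path): cytoolz.concat flattens the tuple of (position, word) pairs and cytoolz.partition regroups the flat sequence into pairs after a new step is appended.

import cytoolz as ct

previous_start = ((1, 'a'), (2, 'b'))
flat = list(ct.concat(previous_start))            # [1, 'a', 2, 'b']
current_path = tuple(ct.partition(2, flat + [3, 'c']))
assert current_path == ((1, 'a'), (2, 'b'), (3, 'c'))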
Example #51
 def concat(self):
     return self.__class__(cytoolz.concat(self))
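A self-contained sketch (an assumed minimal class, not the original one) of how such a concat method is typically used: a list subclass whose concat() flattens one level of nesting and returns the same type.

import cytoolz

class SeqList(list):
    def concat(self):
        return self.__class__(cytoolz.concat(self))

assert SeqList([[1, 2], [3]]).concat() == [1, 2, 3]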
Example #52
def compute_up(t, example, children, **kwargs):
    return concat(children)
Example #53
def build_ngram_model(sentences, n, pad='<eos>'):
    """
    generates a dictionary of word-ngram counts from a list of sentences.
    """
    return frequencies( concat(ngrams(sent, n, pad) for sent in sentences) )
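A standalone sketch of the counting above, with toolz.sliding_window standing in for the source's ngrams helper (an assumption, not the original signature): concat chains the per-sentence n-grams and frequencies tallies them.

from toolz import concat, frequencies, sliding_window

sentences = [['a', 'b', 'c'], ['a', 'b']]
counts = frequencies(concat(sliding_window(2, sent) for sent in sentences))
assert counts[('a', 'b')] == 2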
Example #54
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    {a, b}
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr,)

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized,)

    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        args, collections = unzip((unpack_collections(e) for e in expr), 2)
        args = list(args)
        collections = tuple(unique(concat(collections), key=id))
        # Ensure output type matches input type
        if typ is not list:
            args = (typ, args)
        return args, collections

    if typ is dict:
        args, collections = unpack_collections([[k, v] for k, v in expr.items()])
        return (dict, args), collections

    if typ is slice:
        args, collections = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(args), collections

    if is_dataclass(expr):
        args, collections = unpack_collections([[f.name, getattr(expr, f.name)] for f in
                                               dataclass_fields(expr)])

        return (apply, typ, (), (dict, args)), collections

    return expr, ()
Example #55
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies which core dimensions are consumed and produced by ``func``,
        following the NumPy generalized-ufunc signature specification [2]_.
    *args : numeric
        Input arrays or scalars to the callable function.
    output_dtypes : dtype or list of dtypes, keyword only
        dtype or list of output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking; otherwise chunk sizes need to match and core
        dimensions may only consist of a single chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a, output_dtypes=2*(a.dtype,))
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, output_dtypes=a.dtype, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    core_input_dimss, core_output_dimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(core_output_dimss, list) else len(core_output_dimss)

    ## Assert output_dtypes
    if output_dtypes is None:
        raise ValueError("Must specify `output_dtypes` of output array(s)")
    elif isinstance(output_dtypes, str):
        otypes = list(output_dtypes)
        output_dtypes = otypes[0] if nout is None else otypes
    elif isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            raise ValueError("Must specify single dtype for `output_dtypes` for function with one output")
        otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(core_input_dimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %d given"
                         % (len(core_input_dimss), len(args)))

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [tuple(c[0] for c in a.chunks) for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, core_input_dimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    _core_input_shapes = [dict(zip(cid, s[n:])) for s, n, cid in zip(input_shapes, num_loopdims, core_input_dimss)]
    core_shapes = merge(output_sizes, *_core_input_shapes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, core_input_dimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else set()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            _dimsizes = dimsizess.get(dim, [])
            _dimsizes.append(size)
            dimsizess[dim] = _dimsizes
            _chunksizes = chunksizess.get(dim, [])
            _chunksizes.append(chunksize)
            chunksizess[dim] = _chunksizes
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0] < core_shapes[dim]):
                raise ValueError("Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(func, loop_output_dims, *arginds,
               dtype=int,  # Only a dummy dtype; any one will do
               concatenate=True,
               **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    dsk = tmp.__dask_graph__()
    keys = list(flatten(tmp.__dask_keys__()))
    _anykey = keys[0]
    name, token = _anykey[0].split('-')

    ### *) Treat direct output
    if nout is None:
        core_output_dimss = [core_output_dimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, cod, odt in zip(count(0), core_output_dimss, output_dtypes):
        core_output_shape = tuple(core_shapes[d] for d in cod)
        core_chunkinds = len(cod) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        leaf_arr = Array(sharedict.merge((leaf_name, leaf_dsk), dsk),
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
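The docstring mentions ``output_sizes`` for core dimensions that appear only on the output, but does not demonstrate it; a hedged sketch (function name and sizes are illustrative):

import dask.array as da
import numpy as np

def broadcast_range(x):
    # No input core dimensions; emit a new core dimension "j" of length 5.
    return x[..., np.newaxis] * np.arange(5)

a = da.random.normal(size=(10, 20), chunks=(5, 10))
b = da.apply_gufunc(broadcast_range, "()->(j)", a,
                    output_dtypes=a.dtype, output_sizes={"j": 5})
# b.compute().shape == (10, 20, 5)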
Example #56
0
def a2c_train_step(agent, abstractor, loader, opt, grad_fn,
                   gamma=0.99, reward_fn=compute_rouge_l,
                   stop_reward_fn=compute_rouge_n(n=1), stop_coeff=1.0):
    opt.zero_grad()
    indices = []
    probs = []
    baselines = []
    ext_sents = []
    art_batch, abs_batch = next(loader)
    for raw_arts in art_batch:
        (inds, ms), bs = agent(raw_arts)
        baselines.append(bs)
        indices.append(inds)
        probs.append(ms)
        ext_sents += [raw_arts[idx.item()]
                      for idx in inds if idx.item() < len(raw_arts)]
    with torch.no_grad():
        summaries = abstractor(ext_sents)
    i = 0
    rewards = []
    avg_reward = 0
    for inds, abss in zip(indices, abs_batch):
        rs = ([reward_fn(summaries[i+j], abss[j])
              for j in range(min(len(inds)-1, len(abss)))]
              + [0 for _ in range(max(0, len(inds)-1-len(abss)))]
              + [stop_coeff*stop_reward_fn(
                  list(concat(summaries[i:i+len(inds)-1])),
                  list(concat(abss)))])
        assert len(rs) == len(inds)
        avg_reward += rs[-1]/stop_coeff
        i += len(inds)-1
        # compute discounted rewards
        R = 0
        disc_rs = []
        for r in rs[::-1]:
            R = r + gamma * R
            disc_rs.insert(0, R)
        rewards += disc_rs
    indices = list(concat(indices))
    probs = list(concat(probs))
    baselines = list(concat(baselines))
    # standardize rewards
    reward = torch.Tensor(rewards).to(baselines[0].get_device())
    reward = (reward - reward.mean()) / (
        reward.std() + float(np.finfo(np.float32).eps))
    baseline = torch.cat(baselines).squeeze()
    avg_advantage = 0
    losses = []
    for action, p, r, b in zip(indices, probs, reward, baseline):
        advantage = r - b
        avg_advantage += advantage
        losses.append(-p.log_prob(action)
                      * (advantage/len(indices))) # divide by T*B
    critic_loss = F.mse_loss(baseline, reward)
    # backprop and update
    autograd.backward(
        [critic_loss] + losses,
        [torch.ones(1).to(critic_loss.get_device())]*(1+len(losses))
    )
    grad_log = grad_fn()
    opt.step()
    log_dict = {}
    log_dict.update(grad_log)
    log_dict['reward'] = avg_reward/len(art_batch)
    log_dict['advantage'] = avg_advantage.item()/len(indices)
    log_dict['mse'] = critic_loss.item()
    assert not math.isnan(log_dict['grad_norm'])
    return log_dict
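The discounted-return loop above is the standard recursion R_t = r_t + gamma * R_{t+1}; a standalone sketch of the same computation, factored out only for illustration:

def discount(rs, gamma=0.99):
    # Walk the rewards in reverse so each step accumulates the future return.
    R, disc_rs = 0.0, []
    for r in reversed(rs):
        R = r + gamma * R
        disc_rs.insert(0, R)
    return disc_rs

# discount([1.0, 0.0, 2.0], gamma=0.5) == [1.5, 1.0, 2.0]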
Example #57
0
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like ``np.vectorize``, but for
    the blocks of dask arrays. If the function itself should also
    be vectorized, use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies which core dimensions are consumed and produced by ``func``,
        following the NumPy generalized-ufunc signature specification [2]_.
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the last two axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, ``func`` is called with a small set of data
        in order to automatically determine the output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking; otherwise chunk sizes need to match and core
        dimensions may only consist of a single chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(("Must specify single dtype or list of one dtype "
                                  "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims, input_coredimss, output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %d given"
                         % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax

        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [dict(zip(icd, s[n:])) for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError("Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only a dummy dtype; any one will do
        concatenate=True,
        **kwargs
    )

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split('-')

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, ocd, odt, oax in zip(count(0), output_coredimss, output_dtypes, output_axes):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
            leaf_arr = leaf_arr[slices]

        tidcs = [None] * len(leaf_arr.shape)
        for i, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = i
        j = 0
        for i in range(len(tidcs)):
            if tidcs[i] is None:
                tidcs[i] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
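The ``axis`` and ``keepdims`` parameters documented above have no example in the docstring; a hedged sketch of a reduction over a non-trailing axis (sizes and chunking are illustrative):

import dask.array as da
import numpy as np

def mean_last(x):
    # apply_gufunc transposes the core dimension to the last axis before calling func.
    return np.mean(x, axis=-1)

a = da.random.normal(size=(10, 20, 30), chunks=(5, 20, 30))
m = da.apply_gufunc(mean_last, "(i)->()", a, axis=1, keepdims=True,
                    output_dtypes=a.dtype)
# m.compute().shape == (10, 1, 30)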