def _merge(self, matches):
    # get matches up to and including first important_match
    # but if no important_match, then all matches are important_matches
    relevant_matches = self._first_important_matches(matches)

    # get individual lines from important_matches that were marked important
    # these will be prepended to the final result
    def get_marked_lines(match, marker):
        return tuple(line for line, flag in zip(match.value(self.__class__),
                                                match.valueflags(self.__class__))
                     if flag is marker)
    top_lines = concat(get_marked_lines(m, ParameterFlag.top) for m in relevant_matches)

    # also get lines that were marked as bottom, but reverse the match order so that lines
    # coming earlier will ultimately be last
    bottom_lines = concat(get_marked_lines(m, ParameterFlag.bottom)
                          for m in reversed(relevant_matches))

    # now, concat all lines, while reversing the matches
    # reverse because elements closer to the end of search path take precedence
    all_lines = concat(m.value(self.__class__) for m in reversed(relevant_matches))

    # stack top_lines + all_lines, then de-dupe
    top_deduped = tuple(unique(concatv(top_lines, all_lines)))

    # take the top-deduped lines, reverse them, and concat with reversed bottom_lines
    # this gives us the reverse of the order we want, but almost there
    # NOTE: for a line value marked both top and bottom, the bottom marker will win out
    #       for the top marker to win out, we'd need one additional de-dupe step
    bottom_deduped = unique(concatv(reversed(tuple(bottom_lines)), reversed(top_deduped)))

    # just reverse, and we're good to go
    return tuple(reversed(tuple(bottom_deduped)))
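# A minimal standalone sketch (not conda's actual Parameter machinery) of the ordering
# trick used in _merge above, assuming only toolz (or cytoolz): unique() keeps the first
# occurrence it sees, so stacking the "top"-flagged lines in front of everything pins them
# to the front, and repeating the trick on the reversed sequence pins the "bottom"-flagged
# lines to the end. The line values below are made up for illustration.
from toolz import concatv, unique

top = ['a']                        # hypothetical lines flagged ParameterFlag.top
bottom = ['z']                     # hypothetical lines flagged ParameterFlag.bottom
all_lines = ['m', 'a', 'z', 'n']   # all lines, later matches first

top_deduped = tuple(unique(concatv(top, all_lines)))
# ('a', 'm', 'z', 'n')  -- 'a' pinned to the front, its duplicate dropped
merged = tuple(reversed(tuple(unique(concatv(reversed(bottom), reversed(top_deduped))))))
# ('a', 'm', 'n', 'z')  -- 'z' pinned to the end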
def convert_cat_codes(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    c = (pd.to_numeric(s, downcast='integer')
         .astype('category')
         .pipe(lambda xf: xf.cat.rename_categories(
             [fmt[k] for k in sorted(xf.unique().dropna())]))
         .cat.set_categories(unq_lvls))
    return c
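# A hedged usage sketch for convert_cat_codes; the fmt mapping and data below are made up
# for illustration (the real FORMATS dicts live elsewhere). fmt maps numeric codes to
# labels, and unique() keeps the first occurrence of each label, so the category order
# follows the sorted codes.
import pandas as pd

fmt = {1: 'Male', 2: 'Female'}         # hypothetical code -> label mapping
s = pd.Series(['1', '2', '2', '1'])    # raw codes, possibly read in as strings
c = convert_cat_codes(s, fmt)
# c is a categorical Series ['Male', 'Female', 'Female', 'Male']
# with categories ['Male', 'Female']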
async def _process_headers(self, peer: ETHPeer, headers: List[BlockHeader]) -> int:
    await self._download_block_parts(
        [header for header in headers if not _is_body_empty(header)],
        self.request_bodies,
        self._downloaded_bodies,
        _body_key,
        'body')
    self.logger.info("Got block bodies for chain segment")
    missing_receipts = [header for header in headers if not _is_receipts_empty(header)]
    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/fail: ropsten blocks 2503212 and 2503284),
    # so we do this to avoid requesting the same receipts multiple times.
    missing_receipts = list(unique(missing_receipts, key=_receipts_key))
    await self._download_block_parts(
        missing_receipts,
        self.request_receipts,
        self._downloaded_receipts,
        _receipts_key,
        'receipt')
    self.logger.info("Got block receipts for chain segment")
    # FIXME: Get the bodies returned by self._download_block_parts above and use
    # persist_block here.
    for header in headers:
        await self.chaindb.coro_persist_header(header)
    head = await self.chaindb.coro_get_canonical_head()
    self.logger.info("Imported chain segment, new head: #%d", head.block_number)
    if head.hash == peer.head_hash:
        self.logger.info("Fast sync with %s completed", peer)
        self._sync_complete.set()
    return head.block_number
def terms(
    doclike: types.DocLike,
    *,
    ngs: Optional[int | Collection[int] | types.DocLikeToSpans] = None,
    ents: Optional[bool | types.DocLikeToSpans] = None,
    ncs: Optional[bool | types.DocLikeToSpans] = None,
    dedupe: bool = True,
) -> Iterable[Span]:
    """
    Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks --
    from ``doclike`` as a single, concatenated collection, with optional deduplication
    of spans extracted by more than one type.

    .. code-block:: pycon

        >>> extract.terms(doc, ngs=2, ents=True, ncs=True)
        >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2))
        >>> extract.terms(doc, ents=extract.entities)
        >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON"))

    Args:
        doclike
        ngs: N-gram terms to be extracted.
            If one or multiple ints, :func:`textacy.extract.ngrams(doclike, n=ngs)`
            is used to extract terms; if a callable, ``ngs(doclike)`` is used to
            extract terms; if None, no n-gram terms are extracted.
        ents: Entity terms to be extracted.
            If True, :func:`textacy.extract.entities(doclike)` is used to extract terms;
            if a callable, ``ents(doclike)`` is used to extract terms;
            if None, no entity terms are extracted.
        ncs: Noun chunk terms to be extracted.
            If True, :func:`textacy.extract.noun_chunks(doclike)` is used to extract terms;
            if a callable, ``ncs(doclike)`` is used to extract terms;
            if None, no noun chunk terms are extracted.
        dedupe: If True, deduplicate terms whose spans are extracted by multiple types
            (e.g. a span that is both an n-gram and an entity), as identified by identical
            (start, stop) indexes in ``doclike``; otherwise, don't.

    Yields:
        Next term from ``doclike``, in order of n-grams then entities then noun chunks,
        with each collection's terms given in order of appearance.

    Note:
        This function is *not* to be confused with keyterm extraction, which leverages
        statistics and algorithms to quantify the "key"-ness of terms before returning
        the top-ranking terms. There is no such scoring or ranking here.

    See Also:
        - :func:`textacy.extract.ngrams()`
        - :func:`textacy.extract.entities()`
        - :func:`textacy.extract.noun_chunks()`
        - :mod:`textacy.extract.keyterms`
    """
    extractors = _get_extractors(ngs, ents, ncs)
    terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors)
    if dedupe is True:
        terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end))
    for term in terms_:
        yield term
def conda_build_local_paths(self):
    # does file system reads to make sure paths actually exist
    return tuple(unique(full_path for full_path in (
        expand(d) for d in (
            self._croot,
            self.bld_path,
            self.conda_build.get('root-dir'),
            join(self.root_prefix, 'conda-bld'),
            '~/conda-bld',
        ) if d
    ) if isdir(full_path)))
def validate_cats_for_fmt(x, fmtid, convfn):
    fmt = FORMATS[fmtid]
    fmt_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    xs = pd.Series(x, name=fmtid)
    xc = convfn(xs, fmt)
    assert type(xc) == type(xs)
    assert xc.dtype.name == 'category'
    assert list(xc.cat.categories) == fmt_lvls
    vc = xc.value_counts().to_dict()
    assert set(vc.keys()) == set(fmt.values())
    return xc
def _skip_empty_and_duplicated_receipts(
        self, headers: List[BlockHeader]) -> Generator[BlockHeader, None, None]:
    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/fail: ropsten blocks 2503212 and 2503284), so
    # we have an extra check here to avoid requesting those receipts multiple times.
    headers = list(unique(headers, key=operator.attrgetter('receipt_root')))
    for header in headers:
        if (header.receipt_root != self.chaindb.empty_root_hash
                and header.receipt_root not in self._pending_receipts):
            yield header
def n_unique_words(doc_or_words: Union[Doc, Iterable[Token]]) -> int:
    """
    Compute the number of *unique* words in a document.

    Args:
        doc_or_words: If a spaCy ``Doc``, non-punctuation tokens (words) are extracted;
            if an iterable of spaCy ``Token`` s, all are included as-is.
    """
    words = _get_words(doc_or_words)
    # NOTE: this stdlib solution is slower than itertoolz for docs with ~250+ words
    # so let's take a small hit on short docs for the sake of big wins on long docs
    # return len({word.lower for word in words})
    return itertoolz.count(itertoolz.unique(word.lower for word in words))
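# A minimal sketch of the same counting pattern on plain strings, assuming only toolz.
# The real function operates on spaCy Tokens, where ``.lower`` is an attribute (the hash
# of the lowercase form), not a method; here the lowering uses str.lower() instead.
from toolz import itertoolz

words = ["The", "the", "cat", "sat", "Cat"]
n_unique = itertoolz.count(itertoolz.unique(w.lower() for w in words))
assert n_unique == 3  # {"the", "cat", "sat"}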
async def _process_headers(self, peer: ETHPeer, headers: List[BlockHeader]) -> int:
    await self._download_block_parts(
        [header for header in headers if not _is_body_empty(header)],
        self.request_bodies,
        self._downloaded_bodies,
        _body_key,
        'body')
    self.logger.info("Got block bodies for chain segment")
    missing_receipts = [header for header in headers if not _is_receipts_empty(header)]
    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/fail: ropsten blocks 2503212 and 2503284),
    # so we do this to avoid requesting the same receipts multiple times.
    missing_receipts = list(unique(missing_receipts, key=_receipts_key))
    await self._download_block_parts(
        missing_receipts,
        self.request_receipts,
        self._downloaded_receipts,
        _receipts_key,
        'receipt')
    self.logger.info("Got block receipts for chain segment")
    # FIXME: Get the bodies returned by self._download_block_parts above and use
    # persist_block here.
    for header in headers:
        await self.chaindb.coro_persist_header(header)
    head = await self.chaindb.coro_get_canonical_head()
    self.logger.info("Imported chain segment, new head: #%d", head.block_number)
    # Quite often the header batch we receive here includes headers past the peer's reported
    # head (via the NewBlock msg), so we can't compare our head's hash to the peer's in
    # order to see if the sync is completed. Instead we just check that we have the peer's
    # head_hash in our chain.
    try:
        await self.chaindb.coro_get_block_header_by_hash(peer.head_hash)
    except HeaderNotFound:
        pass
    else:
        self.logger.info("Fast sync with %s completed", peer)
        self._sync_complete.set()
    return head.block_number
def transform(self, doclikes: Iterable[types.DocLike]) -> Iterable[Tuple[str, ...]]:
    """
    Convert a sequence of spaCy Docs or Spans into an ordered, nested sequence
    of terms as strings.

    Args:
        doclikes

    Yields:
        Ordered sequence of terms as strings for next Doc or Span.
    """
    normalize_ = self.normalize
    for doclike in doclikes:
        terms = itertoolz.concat(tokenizer(doclike) for tokenizer in self.tokenizers)
        if self.dedupe is True:
            terms = itertoolz.unique(terms, lambda span: (span.start, span.end))
        yield tuple(normalize_(term) for term in terms)
def test_unique():
    assert tuple(unique((1, 2, 3))) == (1, 2, 3)
    assert tuple(unique((1, 2, 1, 3))) == (1, 2, 3)
    assert tuple(unique((1, 2, 3), key=iseven)) == (1, 2)
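# A standalone, runnable version of the keyed case above: unique() emits an element only
# the first time its key value is seen, so with iseven as the key only the first odd value
# (1) and the first even value (2) come through. iseven here is a stand-in for the helper
# the test suite defines elsewhere.
from toolz import unique

def iseven(n):
    return n % 2 == 0

assert tuple(unique((1, 2, 3), key=iseven)) == (1, 2)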
async def _sync(self, peer: ETHPeer) -> None:
    head = await self.chaindb.coro_get_canonical_head()
    head_td = await self.chaindb.coro_get_score(head.hash)
    if peer.head_td <= head_td:
        self.logger.info(
            "Head TD (%d) announced by %s not higher than ours (%d), not syncing",
            peer.head_td, peer, head_td)
        return

    self.logger.info("Starting sync with %s", peer)
    # FIXME: Fetch a batch of headers, in reverse order, starting from our current head, and
    # find the common ancestor between our chain and the peer's.
    start_at = max(0, head.block_number - eth.MAX_HEADERS_FETCH)
    while True:
        self.logger.info("Fetching chain segment starting at #%d", start_at)
        peer.sub_proto.send_get_block_headers(start_at, eth.MAX_HEADERS_FETCH, reverse=False)
        try:
            headers = await wait_with_token(
                self._new_headers.get(),
                peer.wait_until_finished(),
                token=self.cancel_token,
                timeout=self._reply_timeout)
        except TimeoutError:
            self.logger.warn("Timeout waiting for header batch from %s, aborting sync", peer)
            await peer.stop()
            break

        if peer.is_finished():
            self.logger.info("%s disconnected, aborting sync", peer)
            break

        self.logger.info("Got headers segment starting at #%d", start_at)

        # TODO: Process headers for consistency.
        await self._download_block_parts(
            [header for header in headers if not _is_body_empty(header)],
            self.request_bodies,
            self._downloaded_bodies,
            _body_key,
            'body')
        self.logger.info("Got block bodies for chain segment starting at #%d", start_at)

        missing_receipts = [header for header in headers if not _is_receipts_empty(header)]
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/fail: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        missing_receipts = list(unique(missing_receipts, key=_receipts_key))
        await self._download_block_parts(
            missing_receipts,
            self.request_receipts,
            self._downloaded_receipts,
            _receipts_key,
            'receipt')
        self.logger.info("Got block receipts for chain segment starting at #%d", start_at)

        for header in headers:
            await self.chaindb.coro_persist_header(header)
            start_at = header.block_number + 1

        self.logger.info("Imported chain segment, new head: #%d", start_at - 1)
        head = await self.chaindb.coro_get_canonical_head()
        if head.hash == peer.head_hash:
            self.logger.info("Chain sync with %s completed", peer)
            self._sync_complete.set()
            break
def convert_cat_force(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    c = (pd.to_numeric(s, downcast='integer')
         .replace(to_replace=fmt)
         .astype('category')
         .cat.set_categories(unq_lvls))
    return c