def _merge(self, matches):
    # get matches up to and including first important_match
    # but if no important_match, then all matches are important_matches
    relevant_matches = self._first_important_matches(matches)

    # get individual lines from important_matches that were marked important
    # these will be prepended to the final result
    def get_marked_lines(match, marker):
        return tuple(line
                     for line, flag in zip(match.value(self.__class__),
                                           match.valueflags(self.__class__))
                     if flag is marker)
    top_lines = concat(get_marked_lines(m, ParameterFlag.top) for m in relevant_matches)

    # also get lines that were marked as bottom, but reverse the match order so that lines
    # coming earlier will ultimately be last
    bottom_lines = concat(get_marked_lines(m, ParameterFlag.bottom)
                          for m in reversed(relevant_matches))

    # now, concat all lines, while reversing the matches
    # reverse because elements closer to the end of search path take precedence
    all_lines = concat(m.value(self.__class__) for m in reversed(relevant_matches))

    # stack top_lines + all_lines, then de-dupe
    top_deduped = tuple(unique(concatv(top_lines, all_lines)))

    # take the top-deduped lines, reverse them, and concat with reversed bottom_lines
    # this gives us the reverse of the order we want, but almost there
    # NOTE: for a line value marked both top and bottom, the bottom marker will win out
    #       for the top marker to win out, we'd need one additional de-dupe step
    bottom_deduped = unique(concatv(reversed(tuple(bottom_lines)), reversed(top_deduped)))

    # just reverse, and we're good to go
    return tuple(reversed(tuple(bottom_deduped)))
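# A minimal standalone sketch (not from conda) of the ordering trick in _merge above,
# using plain strings in place of Match objects and the regular toolz package, which
# provides the same concat/concatv/unique helpers these snippets import: top-marked
# lines are pulled to the front, bottom-marked lines are pushed to the end, and
# duplicates collapse toward the winning position.
from toolz import concatv, unique

top_lines = ('t1',)
bottom_lines = ('b1',)
all_lines = ('b1', 'x', 't1', 'y')

top_deduped = tuple(unique(concatv(top_lines, all_lines)))
bottom_deduped = unique(concatv(reversed(tuple(bottom_lines)), reversed(top_deduped)))
merged = tuple(reversed(tuple(bottom_deduped)))
print(merged)  # ('t1', 'x', 'y', 'b1') -- 't1' forced to the front, 'b1' to the end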
def _verify(cls, prefix_setups, prefix_action_groups):
    exceptions = tuple(exc for exc in concatv(
        concat(cls._verify_individual_level(prefix_group)
               for prefix_group in itervalues(prefix_action_groups)),
        concat(cls._verify_prefix_level(target_prefix, prefix_group)
               for target_prefix, prefix_group in iteritems(prefix_action_groups)),
        cls._verify_transaction_level(prefix_setups),
    ) if exc)
    return exceptions
def custom_channels(self):
    from ..models.channel import Channel
    custom_channels = (Channel.make_simple_channel(self.channel_alias, url, name)
                       for name, url in iteritems(self._custom_channels))
    all_sources = self.default_channels, (self.local_build_root_channel,), custom_channels
    all_channels = (ch for ch in concat(all_sources))
    return odict((x.name, x) for x in all_channels)
def query(self, package_ref_or_match_spec):
    if not self._loaded:
        self.load()
    param = package_ref_or_match_spec
    if isinstance(param, string_types):
        param = MatchSpec(param)
    if isinstance(param, MatchSpec):
        if param.get_exact_value('name'):
            package_name = param.get_exact_value('name')
            for prec in self._names_index[package_name]:
                if param.match(prec):
                    yield prec
        elif param.get_exact_value('track_features'):
            track_features = param.get_exact_value('track_features') or ()
            candidates = concat(self._track_features_index[feature_name]
                                for feature_name in track_features)
            for prec in candidates:
                if param.match(prec):
                    yield prec
        else:
            for prec in self._package_records:
                if param.match(prec):
                    yield prec
    else:
        assert isinstance(param, PackageRef)
        for prec in self._names_index[param.name]:
            if prec == param:
                yield prec
def custom_multichannels(self):
    from ..models.channel import Channel

    reserved_multichannel_urls = odict((
        (DEFAULTS_CHANNEL_NAME, self._default_channels),
        ('local', self.conda_build_local_urls),
    ))
    reserved_multichannels = odict(
        (name, tuple(Channel.make_simple_channel(self.channel_alias, url) for url in urls))
        for name, urls in iteritems(reserved_multichannel_urls)
    )
    custom_multichannels = odict(
        (name, tuple(Channel.make_simple_channel(self.channel_alias, url) for url in urls))
        for name, urls in iteritems(self._custom_multichannels)
    )
    all_multichannels = odict(
        (name, channels)
        for name, channels in concat(map(iteritems, (
            custom_multichannels,
            reserved_multichannels,  # reserved comes last, so reserved overrides custom
        )))
    )
    return all_multichannels
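# A small standalone illustration (invented data) of the override behavior noted in the
# comment above: feeding several dicts' items through concat into a single odict means
# entries from the dict listed later win whenever keys collide.
from collections import OrderedDict as odict
from toolz import concat

custom = odict([('defaults', ('my-mirror',)), ('extras', ('extra-channel',))])
reserved = odict([('defaults', ('main', 'r')), ('local', ('file:///tmp/conda-bld',))])

merged = odict((name, urls) for name, urls in concat(d.items() for d in (custom, reserved)))
print(merged['defaults'])  # ('main', 'r') -- the reserved entry overrides the custom one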
def get_pfe(self):
    from .package_cache import ProgressiveFetchExtract
    if not self.prefix_setups:
        return ProgressiveFetchExtract(())
    else:
        link_precs = set(concat(stp.link_precs for stp in itervalues(self.prefix_setups)))
        return ProgressiveFetchExtract(link_precs)
def query_all(cls, package_ref_or_match_spec, pkgs_dirs=None):
    if pkgs_dirs is None:
        pkgs_dirs = context.pkgs_dirs
    return concat(pcache.query(package_ref_or_match_spec)
                  for pcache in cls.all_caches_writable_first(pkgs_dirs))
def terms(
    doclike: types.DocLike,
    *,
    ngs: Optional[int | Collection[int] | types.DocLikeToSpans] = None,
    ents: Optional[bool | types.DocLikeToSpans] = None,
    ncs: Optional[bool | types.DocLikeToSpans] = None,
    dedupe: bool = True,
) -> Iterable[Span]:
    """
    Extract one or multiple types of terms -- ngrams, entities, and/or noun chunks --
    from ``doclike`` as a single, concatenated collection, with optional deduplication
    of spans extracted by more than one type.

    .. code-block:: pycon

        >>> extract.terms(doc, ngs=2, ents=True, ncs=True)
        >>> extract.terms(doc, ngs=lambda doc: extract.ngrams(doc, n=2))
        >>> extract.terms(doc, ents=extract.entities)
        >>> extract.terms(doc, ents=partial(extract.entities, include_types="PERSON"))

    Args:
        doclike
        ngs: N-gram terms to be extracted.
            If one or multiple ints, :func:`textacy.extract.ngrams(doclike, n=ngs)` is
            used to extract terms; if a callable, ``ngs(doclike)`` is used to extract
            terms; if None, no n-gram terms are extracted.
        ents: Entity terms to be extracted.
            If True, :func:`textacy.extract.entities(doclike)` is used to extract terms;
            if a callable, ``ents(doclike)`` is used to extract terms;
            if None, no entity terms are extracted.
        ncs: Noun chunk terms to be extracted.
            If True, :func:`textacy.extract.noun_chunks(doclike)` is used to extract terms;
            if a callable, ``ncs(doclike)`` is used to extract terms;
            if None, no noun chunk terms are extracted.
        dedupe: If True, deduplicate terms whose spans are extracted by multiple types
            (e.g. a span that is both an n-gram and an entity), as identified by identical
            (start, stop) indexes in ``doclike``; otherwise, don't.

    Returns:
        Next term from ``doclike``, in order of n-grams then entities then noun chunks,
        with each collection's terms given in order of appearance.

    Note:
        This function is *not* to be confused with keyterm extraction, which leverages
        statistics and algorithms to quantify the "key"-ness of terms before returning
        the top-ranking terms. There is no such scoring or ranking here.

    See Also:
        - :func:`textacy.extract.ngrams()`
        - :func:`textacy.extract.entities()`
        - :func:`textacy.extract.noun_chunks()`
        - :mod:`textacy.extract.keyterms`
    """
    extractors = _get_extractors(ngs, ents, ncs)
    terms_ = itertoolz.concat(extractor(doclike) for extractor in extractors)
    if dedupe is True:
        terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end))
    for term in terms_:
        yield term
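# Sketch of the combine-and-dedupe pattern used by terms(), with lightweight stand-in
# "spans" instead of real spaCy Span objects (all names below are illustrative only):
# each extractor yields spans, concat chains them, and unique() drops spans that share
# the same (start, end) offsets so overlapping extractors aren't double-counted.
from collections import namedtuple
from toolz import itertoolz

Span = namedtuple("Span", ["start", "end", "text"])

def fake_ngrams(doc):
    return [Span(0, 2, "New York"), Span(2, 4, "is big")]

def fake_entities(doc):
    return [Span(0, 2, "New York")]  # overlaps the first ngram

extractors = (fake_ngrams, fake_entities)
terms_ = itertoolz.concat(extractor(None) for extractor in extractors)
terms_ = itertoolz.unique(terms_, lambda span: (span.start, span.end))
print([t.text for t in terms_])  # ['New York', 'is big']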
def query_all(cls, package_ref_or_match_spec, pkgs_dirs=None):
    if pkgs_dirs is None:
        pkgs_dirs = context.pkgs_dirs
    return concat(pcache.query(package_ref_or_match_spec) for pcache in concatv(
        cls.writable_caches(pkgs_dirs),
        cls.read_only_caches(pkgs_dirs),
    ))
def explode_directories(child_directories, already_split=False):
    # get all directories including parents
    # use already_split=True for the result of get_all_directories()
    maybe_split = lambda x: x if already_split else x.split('/')
    return set(concat(accumulate(join, maybe_split(directory))
                      for directory in child_directories))
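# Standalone sketch of how accumulate + concat explodes leaf directories into the full
# set of parent directories; this assumes POSIX-style '/' separators and that `join`
# is os.path.join, as in the surrounding conda module.
from os.path import join
from toolz import accumulate, concat

def explode(child_directories):
    return set(concat(accumulate(join, d.split('/')) for d in child_directories))

print(sorted(explode(['a/b/c', 'a/b/d'])))  # ['a', 'a/b', 'a/b/c', 'a/b/d']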
def custom_channels(self):
    from ..models.channel import Channel
    custom_channels = (
        Channel.make_simple_channel(self.channel_alias, url, name)
        for name, url in iteritems(self._custom_channels)
    )
    channels_from_multichannels = concat(channel for channel
                                         in itervalues(self.custom_multichannels))
    all_channels = odict((x.name, x) for x in (ch for ch in concatv(
        channels_from_multichannels,
        custom_channels,
    )))
    return all_channels
def custom_multichannels(self):
    from ..models.channel import Channel
    default_custom_multichannels = {
        'defaults': self.default_channels,
        'local': (self.local_build_root_channel,),
    }
    all_channels = default_custom_multichannels, self._custom_multichannels
    return odict((name, tuple(Channel(v) for v in c))
                 for name, c in concat(map(iteritems, all_channels)))
def query_all(channels, subdirs, package_ref_or_match_spec):
    from .index import check_whitelist  # TODO: fix in-line import
    channel_urls = all_channel_urls(channels, subdirs=subdirs)
    check_whitelist(channel_urls)
    with ThreadLimitedThreadPoolExecutor() as executor:
        futures = tuple(executor.submit(
            SubdirData(Channel(url)).query, package_ref_or_match_spec
        ) for url in channel_urls)
        return tuple(concat(future.result() for future in as_completed(futures)))
def check_whitelist(channel_urls):
    if context.whitelist_channels:
        whitelist_channel_urls = tuple(
            concat(Channel(c).base_urls for c in context.whitelist_channels))
        for url in channel_urls:
            these_urls = Channel(url).base_urls
            if not all(this_url in whitelist_channel_urls for this_url in these_urls):
                raise ChannelNotAllowed(Channel(url))
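# Standalone sketch of the whitelist check above: flatten every allowed channel's base
# URLs into one tuple with concat, then require that all of a candidate channel's URLs
# appear in it (plain strings stand in for conda Channel objects; the URLs are made up
# for illustration).
from toolz import concat

whitelist = {
    'conda-forge': ('https://conda.anaconda.org/conda-forge',),
    'defaults': ('https://repo.anaconda.com/pkgs/main', 'https://repo.anaconda.com/pkgs/r'),
}
whitelist_urls = tuple(concat(whitelist.values()))

candidate_urls = ('https://conda.anaconda.org/bioconda',)
print(all(url in whitelist_urls for url in candidate_urls))  # False -- not whitelisted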
def _merge(self, matches): # get matches up to and including first important_match # but if no important_match, then all matches are important_matches relevant_matches_and_values = tuple( (match, match.value(self)) for match in self._first_important_matches(matches)) for match, value in relevant_matches_and_values: if not isinstance(value, tuple): raise InvalidTypeError(self.name, value, match.source, value.__class__.__name__, self._type.__name__) # get individual lines from important_matches that were marked important # these will be prepended to the final result def get_marked_lines(match, marker, parameter_obj): return tuple(line for line, flag in zip( match.value(parameter_obj), match.valueflags(parameter_obj)) if flag is marker) if match else () top_lines = concat( get_marked_lines(m, ParameterFlag.top, self) for m, _ in relevant_matches_and_values) # also get lines that were marked as bottom, but reverse the match order so that lines # coming earlier will ultimately be last bottom_lines = concat( get_marked_lines(m, ParameterFlag.bottom, self) for m, _ in reversed(relevant_matches_and_values)) # now, concat all lines, while reversing the matches # reverse because elements closer to the end of search path take precedence all_lines = concat(v for _, v in reversed(relevant_matches_and_values)) # stack top_lines + all_lines, then de-dupe top_deduped = tuple(unique(concatv(top_lines, all_lines))) # take the top-deduped lines, reverse them, and concat with reversed bottom_lines # this gives us the reverse of the order we want, but almost there # NOTE: for a line value marked both top and bottom, the bottom marker will win out # for the top marker to win out, we'd need one additional de-dupe step bottom_deduped = unique( concatv(reversed(tuple(bottom_lines)), reversed(top_deduped))) # just reverse, and we're good to go return tuple(reversed(tuple(bottom_deduped)))
def get_pfe(self):
    from .package_cache import ProgressiveFetchExtract
    if not self.prefix_setups:
        return ProgressiveFetchExtract({}, ())
    else:
        index = next(itervalues(self.prefix_setups)).index
        link_dists = set(concat(stp.link_dists for stp in itervalues(self.prefix_setups)))
        return ProgressiveFetchExtract(index, link_dists)
def execute(self):
    if not self._verified:
        self.verify()
    assert not context.dry_run
    try:
        self._execute(tuple(concat(interleave(itervalues(self.prefix_action_groups)))))
    finally:
        rm_rf(self.transaction_context['temp_dir'])
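# Standalone sketch of the interleave + concat combination in execute(): interleave
# takes one action group from each prefix in turn, and concat flattens the groups into
# a single stream of actions (plain strings stand in for conda's action objects).
from toolz import concat, interleave

prefix_a_groups = (('unlink-a1', 'unlink-a2'), ('link-a1',))
prefix_b_groups = (('unlink-b1',), ('link-b1', 'link-b2'))

ordered = tuple(concat(interleave((prefix_a_groups, prefix_b_groups))))
print(ordered)
# ('unlink-a1', 'unlink-a2', 'unlink-b1', 'link-a1', 'link-b1', 'link-b2')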
def _get_pfe(self):
    from .package_cache_data import ProgressiveFetchExtract
    if self._pfe is not None:
        pfe = self._pfe
    elif not self.prefix_setups:
        self._pfe = pfe = ProgressiveFetchExtract(())
    else:
        link_precs = set(concat(stp.link_precs for stp in itervalues(self.prefix_setups)))
        self._pfe = pfe = ProgressiveFetchExtract(link_precs)
    return pfe
def _make_channel_priorities(channels):
    priorities_map = odict()
    for priority_counter, chn in enumerate(concat(
            (Channel(cc) for cc in c._channels) if isinstance(c, MultiChannel) else (c,)
            for c in (Channel(c) for c in channels)
    )):
        channel_name = chn.name
        if channel_name in priorities_map:
            continue
        priorities_map[channel_name] = min(priority_counter, MAX_CHANNEL_PRIORITY - 1)
    return priorities_map
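# Standalone sketch of the flatten-and-enumerate pattern above: a multichannel expands
# to several names, a single channel to one, and the first occurrence of each name wins
# the lowest (best) priority number. Tuples and strings stand in for MultiChannel and
# Channel objects, and MAX_PRIORITY is a made-up cap.
from collections import OrderedDict
from toolz import concat

MAX_PRIORITY = 10
channels = [('main', 'r'), 'conda-forge', 'main']  # a "multichannel" followed by two channels

priorities = OrderedDict()
for counter, name in enumerate(concat(c if isinstance(c, tuple) else (c,) for c in channels)):
    if name in priorities:
        continue
    priorities[name] = min(counter, MAX_PRIORITY - 1)
print(dict(priorities))  # {'main': 0, 'r': 1, 'conda-forge': 2}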
def _collect_repodatas_serial_as_index(use_cache, tasks):
    session = CondaSession()
    results = (fetch_repodata(url, schan, pri, use_cache=use_cache, session=session)
               for url, schan, pri in tasks)
    index = dict(concat(iteritems(result.get('packages', {}))
                        for result in results if result))
    return index
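# Standalone sketch of building one index from several repodata results: chaining every
# result's 'packages' items through concat and feeding them to dict() merges the maps,
# skipping failed fetches via the `if result` filter (the repodata dicts below are
# invented for illustration).
from toolz import concat

results = [
    {'packages': {'numpy-1.11.3-py36_0.tar.bz2': {'name': 'numpy'}}},
    {'packages': {'scipy-0.19.0-np111py36_0.tar.bz2': {'name': 'scipy'}}},
    None,  # a failed fetch
]
index = dict(concat(result.get('packages', {}).items() for result in results if result))
print(sorted(index))  # ['numpy-1.11.3-py36_0.tar.bz2', 'scipy-0.19.0-np111py36_0.tar.bz2']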
def check_whitelist(channel_urls):
    if context.whitelist_channels:
        whitelist_channel_urls = tuple(concat(
            Channel(c).base_urls for c in context.whitelist_channels
        ))
        for url in channel_urls:
            these_urls = Channel(url).base_urls
            if not all(this_url in whitelist_channel_urls for this_url in these_urls):
                bad_channel = Channel(url)
                raise OperationNotAllowed("Channel not included in whitelist:\n"
                                          " location: %s\n"
                                          " canonical name: %s\n"
                                          % (bad_channel.location, bad_channel.canonical_name))
def query_all(package_ref_or_match_spec, channels=None, subdirs=None):
    from .index import check_whitelist  # TODO: fix in-line import
    if channels is None:
        channels = context.channels
    if subdirs is None:
        subdirs = context.subdirs
    channel_urls = all_channel_urls(channels, subdirs=subdirs)
    check_whitelist(channel_urls)
    with ThreadLimitedThreadPoolExecutor() as executor:
        futures = tuple(executor.submit(
            SubdirData(Channel(url)).query, package_ref_or_match_spec
        ) for url in channel_urls)
        return tuple(concat(future.result() for future in as_completed(futures)))
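# Standalone sketch of the fan-out/fan-in pattern shared by the query_all variants:
# submit one query per source to a thread pool, then flatten the per-source result
# lists into a single tuple with concat as the futures complete. The query function
# and source names are toy stand-ins, not conda's SubdirData.
from concurrent.futures import ThreadPoolExecutor, as_completed
from toolz import concat

def query_source(source):
    return [f"{source}::pkg-{i}" for i in range(2)]

sources = ('channel-a', 'channel-b')
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = tuple(executor.submit(query_source, s) for s in sources)
    results = tuple(concat(future.result() for future in as_completed(futures)))
print(sorted(results))
# ['channel-a::pkg-0', 'channel-a::pkg-1', 'channel-b::pkg-0', 'channel-b::pkg-1']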
def describe_all_parameters():
    builder = []
    skip_categories = ('CLI-only', 'Hidden and Undocumented')
    for category, parameter_names in iteritems(context.category_map):
        if category in skip_categories:
            continue
        builder.append('# ######################################################')
        builder.append('# ## {:^48} ##'.format(category))
        builder.append('# ######################################################')
        builder.append('')
        builder.extend(concat(parameter_description_builder(name)
                              for name in parameter_names))
        builder.append('')
    return '\n'.join(builder)
def _collect_repodatas_concurrent_as_index(executor, use_cache, tasks):
    futures = (executor.submit(fetch_repodata, url, schan, pri,
                               use_cache=use_cache, session=CondaSession())
               for url, schan, pri in tasks)
    results = (future.result() for future in futures)
    index = dict(concat(iteritems(result.get('packages', {}))
                        for result in results if result))
    return index
def prioritize_channels(channels, with_credentials=True, subdirs=None):
    # prioritize_channels returns an OrderedDict with platform-specific channel
    #   urls as the key, and a tuple of canonical channel name and channel priority
    #   number as the value
    #   ('https://conda.anaconda.org/conda-forge/osx-64/', ('conda-forge', 1))
    channels = concat((Channel(cc) for cc in c._channels) if isinstance(c, MultiChannel) else (c,)
                      for c in (Channel(c) for c in channels))
    result = odict()
    for priority_counter, chn in enumerate(channels):
        channel = Channel(chn)
        for url in channel.urls(with_credentials, subdirs):
            if url in result:
                continue
            result[url] = channel.canonical_name, min(priority_counter, MAX_CHANNEL_PRIORITY - 1)
    return result
def query_all(channels, subdirs, package_ref_or_match_spec):
    channel_urls = all_channel_urls(channels, subdirs=subdirs)
    result = executor = None
    if context.concurrent:
        try:
            from concurrent.futures import ThreadPoolExecutor, as_completed
            executor = ThreadPoolExecutor(10)
            futures = (executor.submit(
                SubdirData(Channel(url)).query, package_ref_or_match_spec
            ) for url in channel_urls)
            result = tuple(concat(future.result() for future in as_completed(futures)))
        except (ImportError, RuntimeError) as e:
            # concurrent.futures is only available in Python >= 3.2 or if futures is installed
            # RuntimeError is thrown if number of threads are limited by OS
            log.debug(repr(e))
    if executor:
        executor.shutdown(wait=True)
    if result is None:
        subdir_datas = (SubdirData(Channel(url)) for url in channel_urls)
        result = tuple(concat(sd.query(package_ref_or_match_spec) for sd in subdir_datas))
    return result
def _verify_individual_level(prefix_action_group):
    all_actions = concat(axngroup.actions
                         for action_groups in prefix_action_group
                         for axngroup in action_groups)

    # run all per-action verify methods
    # one of the more important of these checks is to verify that a file listed in
    # the packages manifest (i.e. info/files) is actually contained within the package
    for axn in all_actions:
        if axn.verified:
            continue
        error_result = axn.verify()
        if error_result:
            formatted_error = ''.join(format_exception_only(type(error_result), error_result))
            log.debug("Verification error in action %s\n%s", axn, formatted_error)
            yield error_result
def query_all(channels, subdirs, package_ref_or_match_spec):
    channel_urls = all_channel_urls(channels, subdirs=subdirs)
    executor = None
    try:
        from concurrent.futures import ThreadPoolExecutor, as_completed
        executor = ThreadPoolExecutor(10)
        futures = (executor.submit(
            SubdirData(Channel(url)).query, package_ref_or_match_spec
        ) for url in channel_urls)
        return tuple(concat(future.result() for future in as_completed(futures)))
    except RuntimeError as e:  # pragma: no cover
        # concurrent.futures is only available in Python >= 3.2 or if futures is installed
        # RuntimeError is thrown if number of threads are limited by OS
        raise
    finally:
        if executor:
            executor.shutdown(wait=True)
def __init__( self, dimensions: List[int], activation: torch.nn.Module = nn.ReLU(), final_activation: Optional[torch.nn.Module] = nn.ReLU(), weight_init: Callable[[torch.Tensor, torch.Tensor, float], None] = default_initialise_weight_bias_, gain: float = nn.init.calculate_gain("relu"), ): """ Autoencoder composed of a symmetric decoder and encoder components accessible via the encoder and decoder attributes. The dimensions input is the list of dimensions occurring in a single stack e.g. [100, 10, 10, 5] will make the embedding_dimension 100 and the hidden dimension 5, with the autoencoder shape [100, 10, 10, 5, 10, 10, 100]. :param dimensions: list of dimensions occurring in a single stack :param activation: activation layer to use for all but final activation, default torch.nn.ReLU :param final_activation: final activation layer to use, set to None to disable, default torch.nn.ReLU :param weight_init: function for initialising weight and bias via mutation, defaults to default_initialise_weight_bias_ :param gain: gain parameter to pass to weight_init """ super(StackedAutoEncoderModel, self).__init__() self.dimensions = dimensions self.embedding_dimension = dimensions[0] self.hidden_dimension = dimensions[-1] # construct the encoder encoder_units = build_units(self.dimensions[:-1], activation) encoder_units.extend( build_units([self.dimensions[-2], self.dimensions[-1]], None)) self.encoder = nn.Sequential(*encoder_units) # construct the decoder decoder_units = build_units(reversed(self.dimensions[1:]), activation) decoder_units.extend( build_units([self.dimensions[1], self.dimensions[0]], final_activation)) self.decoder = nn.Sequential(*decoder_units) # construct the softmax layer self.softmax_layer = nn.Linear(self.dimensions[-1], 2) # loss & optimizer self.criterion = nn.CrossEntropyLoss() self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001) # initialise the weights and biases in the layers for layer in concat([self.encoder, self.decoder]): weight_init(layer[0].weight, layer[0].bias, gain)
def merge(cls, match_specs):
    match_specs = tuple(cls(s) for s in match_specs if s)
    name_groups = groupby(attrgetter('name'), match_specs)
    unmergeable = name_groups.pop('*', []) + name_groups.pop(None, [])

    merged_specs = []
    mergeable_groups = tuple(concat(
        itervalues(groupby(lambda s: s.optional, group))
        for group in itervalues(name_groups)
    ))
    for group in mergeable_groups:
        target_groups = groupby(attrgetter('target'), group)
        target_groups.pop(None, None)
        if len(target_groups) > 1:
            raise ValueError("Incompatible MatchSpec merge:%s" % dashlist(group))
        merged_specs.append(
            reduce(lambda x, y: x._merge(y), group) if len(group) > 1 else group[0])
    return tuple(concatv(merged_specs, unmergeable))
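# Standalone sketch of the grouping step in merge(): toolz.groupby buckets specs by
# name, and concat over the per-name sub-groups (split again by an 'optional' flag)
# yields the mergeable groups one at a time. The (name, optional) tuples below are
# invented stand-ins for MatchSpec objects.
from toolz import concat, groupby

specs = [('numpy', False), ('numpy', True), ('python', False)]
name_groups = groupby(lambda s: s[0], specs)
mergeable_groups = tuple(concat(
    groupby(lambda s: s[1], group).values() for group in name_groups.values()))
print(mergeable_groups)
# ([('numpy', False)], [('numpy', True)], [('python', False)])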
def transform(self, doclikes: Iterable[types.DocLike]) -> Iterable[Tuple[str, ...]]:
    """
    Convert a sequence of spaCy Docs or Spans into an ordered, nested sequence
    of terms as strings.

    Args:
        doclikes

    Yields:
        Ordered sequence of terms as strings for next Doc or Span.
    """
    normalize_ = self.normalize
    for doclike in doclikes:
        terms = itertoolz.concat(tokenizer(doclike) for tokenizer in self.tokenizers)
        if self.dedupe is True:
            terms = itertoolz.unique(terms, lambda span: (span.start, span.end))
        yield tuple(normalize_(term) for term in terms)
def get_ngram_candidates(
    doc: Doc,
    ns: int | Collection[int],
    *,
    include_pos: Optional[str | Collection[str]] = ("NOUN", "PROPN", "ADJ"),
) -> Iterable[Tuple[Token, ...]]:
    """
    Get candidate keyterms from ``doc``, where candidates are n-length sequences
    of tokens (for all n in ``ns``) that don't start/end with a stop word or
    contain punctuation tokens, and whose constituent tokens are filtered by POS tag.

    Args:
        doc
        ns: One or more n values for which to generate n-grams. For example,
            ``2`` gets bigrams; ``(2, 3)`` gets bigrams and trigrams.
        include_pos: One or more POS tags with which to filter ngrams.
            If None, include tokens of all POS tags.

    Yields:
        Next ngram candidate, as a tuple of constituent Tokens.

    See Also:
        :func:`textacy.extract.ngrams()`
    """
    ns = utils.to_collection(ns, int, tuple)
    include_pos = utils.to_collection(include_pos, str, set)
    ngrams = itertoolz.concat(itertoolz.sliding_window(n, doc) for n in ns)
    ngrams = (
        ngram
        for ngram in ngrams
        if not (ngram[0].is_stop or ngram[-1].is_stop)
        and not any(word.is_punct or word.is_space for word in ngram)
    )
    if include_pos:
        ngrams = (
            ngram
            for ngram in ngrams
            if all(word.pos_ in include_pos for word in ngram)
        )
    for ngram in ngrams:
        yield ngram
def get_random_sample(seq, n, stratify=True, random_state=None):
    """
    Args:
        seq (Sequence)
        n (int)
        stratify (bool)
        random_state (int)

    Returns:
        list
    """
    random.seed(a=random_state)
    if stratify is True:
        grped = itertoolz.groupby(operator.itemgetter(1), seq)
        n_per_grp = max(int(round(n / len(grped))), 1)
        sample = list(itertoolz.concat(
            random.sample(examples, min(len(examples), n_per_grp))
            for examples in grped.values()))
        random.shuffle(sample)
        return sample[:n]
    else:
        return random.sample(seq, min(len(seq), n))
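# A quick usage sketch of the stratified branch above: items are (example, label)
# pairs, groupby buckets them by label, and an equal-ish share is sampled from each
# bucket before shuffling (the data is invented for illustration).
import operator
import random
from toolz import itertoolz

seq = [('a1', 'A'), ('a2', 'A'), ('a3', 'A'), ('b1', 'B'), ('b2', 'B')]
random.seed(a=42)
grped = itertoolz.groupby(operator.itemgetter(1), seq)
n_per_grp = max(int(round(4 / len(grped))), 1)  # 2 per label for n=4
sample = list(itertoolz.concat(
    random.sample(examples, min(len(examples), n_per_grp)) for examples in grped.values()))
random.shuffle(sample)
print(sample[:4])  # 4 items, drawn from both the 'A' and 'B' groups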
def to_terms_list(self, ngrams=(1, 2, 3), named_entities=True, lemmatize=True, lowercase=False, as_strings=False, **kwargs): """ Transform ``Doc`` into a sequence of ngrams and/or named entities, which aren't necessarily in order of appearance, where each term appears in the list with the same frequency that it appears in ``Doc``. Args: ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)`` (default) includes unigrams (words), bigrams, and trigrams; `2` if only bigrams are wanted; falsy (e.g. False) to not include any named_entities (bool): if True (default), include named entities in the terms list; note: if ngrams are also included, named entities are added *first*, and any ngrams that exactly overlap with an entity are skipped to prevent double-counting lemmatize (bool): if True (default), lemmatize all terms lowercase (bool): if True and `lemmatize` is False, words are lower- cased as_strings (bool): if True, terms are returned as strings; if False (default), terms are returned as their unique integer ids kwargs: - filter_stops (bool) - filter_punct (bool) - filter_nums (bool) - include_pos (str or Set[str]) - exclude_pos (str or Set[str]) - min_freq (int) - include_types (str or Set[str]) - exclude_types (str or Set[str] - drop_determiners (bool) see :func:`extract.words <textacy.extract.words>`, :func:`extract.ngrams <textacy.extract.ngrams>`, and :func:`extract.named_entities <textacy.extract.named_entities>` for more information on these parameters Yields: int or str: the next term in the terms list, either as a unique integer id or as a string Raises: ValueError: if neither ``named_entities`` nor ``ngrams`` are included .. note:: Despite the name, this is a generator function; to get an actual list of terms, call ``list(doc.to_terms_list())``. 
""" if not named_entities and not ngrams: raise ValueError('either `named_entities` or `ngrams` must be included') if isinstance(ngrams, int): ngrams = (ngrams,) if named_entities is True: ne_kwargs = { 'include_types': kwargs.get('include_types'), 'exclude_types': kwargs.get('exclude_types'), 'drop_determiners': kwargs.get('drop_determiners', True), 'min_freq': kwargs.get('min_freq', 1)} if ngrams: ngram_kwargs = { 'filter_stops': kwargs.get('filter_stops', True), 'filter_punct': kwargs.get('filter_punct', True), 'filter_nums': kwargs.get('filter_nums', False), 'include_pos': kwargs.get('include_pos'), 'exclude_pos': kwargs.get('exclude_pos'), 'min_freq': kwargs.get('min_freq', 1)} terms = [] # special case: ensure that named entities aren't double-counted when # adding words or ngrams that were already added as named entities if named_entities is True and ngrams: ents = tuple(textacy.extract.named_entities(self, **ne_kwargs)) ent_idxs = {(ent.start, ent.end) for ent in ents} terms.append(ents) for n in ngrams: if n == 1: terms.append( (word for word in textacy.extract.words(self, **ngram_kwargs) if (word.idx, word.idx + 1) not in ent_idxs)) else: terms.append( (ngram for ngram in textacy.extract.ngrams(self, n, **ngram_kwargs) if (ngram.start, ngram.end) not in ent_idxs)) # otherwise, no need to check for overlaps else: if named_entities is True: terms.append(textacy.extract.named_entities(self, **ne_kwargs)) else: for n in ngrams: if n == 1: terms.append(textacy.extract.words(self, **ngram_kwargs)) else: terms.append(textacy.extract.ngrams(self, n, **ngram_kwargs)) terms = itertoolz.concat(terms) # convert token and span objects into integer ids if as_strings is False: if lemmatize is True: for term in terms: try: yield term.lemma except AttributeError: yield self.spacy_stringstore[term.lemma_] elif lowercase is True: for term in terms: try: yield term.lower except AttributeError: yield self.spacy_stringstore[term.orth_.lower()] else: for term in terms: try: yield term.orth except AttributeError: yield self.spacy_stringstore[term.orth_] # convert token and span objects into strings else: if lemmatize is True: for term in terms: yield term.lemma_ elif lowercase is True: for term in terms: try: yield term.lower_ except AttributeError: yield term.orth_.lower() else: for term in terms: yield term.orth_
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None): """ Extract key terms from a document using the [SGRank]_ algorithm. Args: doc (``spacy.Doc``) window_width (int, optional): width of sliding window in which term co-occurrences are said to occur n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms idf (dict, optional): mapping of {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency} for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1); NOTE: results are better with idf information Returns: list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their corresponding SGRank scores Raises: ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0] References: .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117. """ if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_toks = len(doc) min_term_freq = min(n_toks // 1500, 4) # build full list of candidate terms terms = list(itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq) for n in range(1, 7))) # if inverse document frequencies available, also add verbs # verbs without IDF downweighting dominate the results, and not in a good way if idf: terms.extend(itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'VERB'}, min_freq=min_term_freq) for n in range(1, 7))) terms_as_strs = {id(term): spacy_utils.normalized_str(term) for term in terms} # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available n_top_20pct = int(len(terms) * 0.2) term_counts = Counter(terms_as_strs[id(term)] for term in terms) if idf: mod_tfidfs = {term: count * idf[term] if ' ' not in term else count for term, count in term_counts.items()} top_term_texts = {term for term, _ in sorted( mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]} else: top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)} terms = [term for term in terms if terms_as_strs[id(term)] in top_term_texts] # compute term weights from statistical attributes term_weights = {} set_terms_as_str = {terms_as_strs[id(terms)] for terms in terms} n_toks_plus_1 = n_toks + 1 for term in terms: term_str = terms_as_strs[id(term)] pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1)) # TODO: assess if len(t) puts too much emphasis on long terms # alternative: term_len = 1 if ' ' not in term else math.sqrt(len(term)) term_len = 1 if ' ' not in term else len(term) term_count = term_counts[term_str] subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str if t2 != term_str and term_str in t2) term_freq_factor = (term_count - subsum_count) if idf and ' ' not in term_str: term_freq_factor *= idf[term_str] term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len # filter terms to only those with positive weights terms = [term for term in terms if term_weights[terms_as_strs[id(term)]] > 0] n_coocs = defaultdict(lambda: defaultdict(int)) sum_logdists = defaultdict(lambda: 
defaultdict(float)) # iterate over windows for start_ind in range(n_toks): end_ind = start_ind + window_width window_terms = (term for term in terms if start_ind <= term.start <= end_ind) # get all token combinations within window for t1, t2 in itertools.combinations(window_terms, 2): if t1 is t2: continue n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1 try: sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ math.log(window_width / abs(t1.start - t2.start)) except ZeroDivisionError: # HACK: pretend that they're 1 token apart sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ math.log(window_width) if end_ind > n_toks: break # compute edge weights between co-occurring terms (nodes) edge_weights = defaultdict(lambda: defaultdict(float)) for t1, t2s in sum_logdists.items(): for t2 in t2s: edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2] # normalize edge weights by sum of outgoing edge weights per term (node) norm_edge_weights = [] for t1, t2s in edge_weights.items(): sum_edge_weights = sum(t2s.values()) norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights}) for t2, weight in t2s.items()) # build the weighted directed graph from edges, rank nodes by pagerank graph = nx.DiGraph() graph.add_edges_from(norm_edge_weights) term_ranks = nx.pagerank_scipy(graph) if isinstance(n_keyterms, float): n_keyterms = int(len(term_ranks) * n_keyterms) return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
def get_all_extracted_entries(cls):
    package_caches = (cls(pd) for pd in context.pkgs_dirs)
    return tuple(pc_entry
                 for pc_entry in concat(map(itervalues, package_caches))
                 if pc_entry.is_extracted)
def as_terms_list(self, words=True, ngrams=(2, 3), named_entities=True, dedupe=True, lemmatize=True, **kwargs): """ Represent doc as a sequence of terms -- which aren't necessarily in order -- including words (unigrams), ngrams (for a range of n), and named entities. NOTE: Despite the name, this is a generator function; to get a *list* of terms, just wrap the call like ``list(doc.as_terms_list())``. Args: words (bool, optional): if True (default), include words in the terms list ngrams (tuple(int), optional): include a range of ngrams in the terms list; default is ``(2, 3)``, i.e. bigrams and trigrams are included; if ngrams aren't wanted, set to False-y NOTE: if n=1 (words) is included here and ``words`` is True, n=1 is skipped named_entities (bool, optional): if True (default), include named entities in the terms list dedupe (bool, optional): if True (default), named entities are added first to the terms list, and any words or ngrams that exactly overlap with previously added entities are skipped to prevent double-counting; since words and ngrams (n > 1) are inherently exclusive, this only applies to entities; you almost certainly want this to be True lemmatize (bool, optional): if True (default), lemmatize all terms; otherwise, return the text as it appeared kwargs: filter_stops (bool) filter_punct (bool) filter_nums (bool) good_pos_tags (set(str)) bad_pos_tags (set(str)) min_freq (int) good_ne_types (set(str)) bad_ne_types (set(str)) drop_determiners (bool) Yields: str: the next term in the terms list """ all_terms = [] # special case: ensure that named entities aren't double-counted when # adding words or ngrams that were already added as named entities if dedupe is True and named_entities is True and (words is True or ngrams): ents = list(self.named_entities(**kwargs)) ent_idxs = {(ent.start, ent.end) for ent in ents} all_terms.append(ents) if words is True: all_terms.append((word for word in self.words(**kwargs) if (word.idx, word.idx + 1) not in ent_idxs)) if ngrams: for n in range(ngrams[0], ngrams[1] + 1): if n == 1 and words is True: continue all_terms.append((ngram for ngram in self.ngrams(n, **kwargs) if (ngram.start, ngram.end) not in ent_idxs)) # otherwise add everything in, duplicates and all else: if named_entities is True: all_terms.append(self.named_entities(**kwargs)) if words is True: all_terms.append(self.words(**kwargs)) if ngrams: for n in range(ngrams[0], ngrams[1] + 1): if n == 1 and words is True: continue all_terms.append(self.ngrams(n, **kwargs)) if lemmatize is True: for term in itertoolz.concat(all_terms): yield term.lemma_ else: for term in itertoolz.concat(all_terms): yield term.text
def make_actions_for_record(pref_or_spec): assert pref_or_spec is not None # returns a cache_action and extract_action # if the pref or spec has an md5 value # look in all caches for package cache record that is # (1) already extracted, and # (2) matches the md5 # If one exists, no actions are needed. md5 = pref_or_spec.get('md5') if md5: extracted_pcrec = next(( pcrec for pcrec in concat(PackageCacheData(pkgs_dir).query(pref_or_spec) for pkgs_dir in context.pkgs_dirs) if pcrec.is_extracted ), None) if extracted_pcrec: return None, None # there is no extracted dist that can work, so now we look for tarballs that # aren't extracted # first we look in all writable caches, and if we find a match, we extract in place # otherwise, if we find a match in a non-writable cache, we link it to the first writable # cache, and then extract first_writable_cache = PackageCacheData.first_writable() pcrec_from_writable_cache = next(( pcrec for pcrec in concat(pcache.query(pref_or_spec) for pcache in PackageCacheData.writable_caches()) if pcrec.is_fetched ), None) if pcrec_from_writable_cache: # extract in place extract_axn = ExtractPackageAction( source_full_path=pcrec_from_writable_cache.package_tarball_full_path, target_pkgs_dir=dirname(pcrec_from_writable_cache.package_tarball_full_path), target_extracted_dirname=basename(pcrec_from_writable_cache.extracted_package_dir), record_or_spec=pcrec_from_writable_cache, md5sum=pcrec_from_writable_cache.md5, ) return None, extract_axn pcrec_from_read_only_cache = next(( pcrec for pcrec in concat(pcache.query(pref_or_spec) for pcache in PackageCacheData.read_only_caches()) if pcrec.is_fetched ), None) if pcrec_from_read_only_cache: # we found a tarball, but it's in a read-only package cache # we need to link the tarball into the first writable package cache, # and then extract try: expected_size_in_bytes = pref_or_spec.size except AttributeError: expected_size_in_bytes = None cache_axn = CacheUrlAction( url=path_to_url(pcrec_from_read_only_cache.package_tarball_full_path), target_pkgs_dir=first_writable_cache.pkgs_dir, target_package_basename=pcrec_from_read_only_cache.fn, md5sum=md5, expected_size_in_bytes=expected_size_in_bytes, ) trgt_extracted_dirname = pcrec_from_read_only_cache.fn[:-len(CONDA_TARBALL_EXTENSION)] extract_axn = ExtractPackageAction( source_full_path=cache_axn.target_full_path, target_pkgs_dir=first_writable_cache.pkgs_dir, target_extracted_dirname=trgt_extracted_dirname, record_or_spec=pcrec_from_read_only_cache, md5sum=pcrec_from_read_only_cache.md5, ) return cache_axn, extract_axn # if we got here, we couldn't find a matching package in the caches # we'll have to download one; fetch and extract url = pref_or_spec.get('url') assert url try: expected_size_in_bytes = pref_or_spec.size except AttributeError: expected_size_in_bytes = None cache_axn = CacheUrlAction( url=url, target_pkgs_dir=first_writable_cache.pkgs_dir, target_package_basename=pref_or_spec.fn, md5sum=md5, expected_size_in_bytes=expected_size_in_bytes, ) extract_axn = ExtractPackageAction( source_full_path=cache_axn.target_full_path, target_pkgs_dir=first_writable_cache.pkgs_dir, target_extracted_dirname=pref_or_spec.fn[:-len(CONDA_TARBALL_EXTENSION)], record_or_spec=pref_or_spec, md5sum=md5, ) return cache_axn, extract_axn
def test_concat():
    assert list(concat([[], [], []])) == []
    assert (list(take(5, concat([['a', 'b'], range(1000000000)])))
            == ['a', 'b', 0, 1, 2])
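# A companion sketch (not part of the original test suite) exercising the other two
# toolz helpers that recur in the snippets above, concatv and unique, in the same
# assertion style:
from toolz import concatv, take, unique

def test_concatv_and_unique():
    # concatv takes its sequences as positional arguments and stays lazy
    assert list(take(4, concatv(['a', 'b'], range(1000000000)))) == ['a', 'b', 0, 1]
    # unique preserves first-seen order, optionally keyed
    assert list(unique([3, 1, 3, 2, 1])) == [3, 1, 2]
    assert list(unique(['aa', 'b', 'cc'], key=len)) == ['aa', 'b']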
def execute_config(args, parser): json_warnings = [] json_get = {} if args.show_sources: if context.json: print(json.dumps(context.collect_all(), sort_keys=True, indent=2, separators=(',', ': '))) else: lines = [] for source, reprs in iteritems(context.collect_all()): lines.append("==> %s <==" % source) lines.extend(format_dict(reprs)) lines.append('') print('\n'.join(lines)) return if args.show is not None: if args.show: paramater_names = args.show all_names = context.list_parameters() not_params = set(paramater_names) - set(all_names) if not_params: from ..exceptions import ArgumentError from ..resolve import dashlist raise ArgumentError("Invalid configuration parameters: %s" % dashlist(not_params)) else: paramater_names = context.list_parameters() from collections import OrderedDict d = OrderedDict((key, getattr(context, key)) for key in paramater_names) if context.json: print(json.dumps(d, sort_keys=True, indent=2, separators=(',', ': '), cls=EntityEncoder)) else: # Add in custom formatting if 'custom_channels' in d: d['custom_channels'] = { channel.name: "%s://%s" % (channel.scheme, channel.location) for channel in itervalues(d['custom_channels']) } if 'custom_multichannels' in d: from ..resolve import dashlist d['custom_multichannels'] = { multichannel_name: dashlist(channels, indent=4) for multichannel_name, channels in iteritems(d['custom_multichannels']) } print('\n'.join(format_dict(d))) context.validate_configuration() return if args.describe is not None: if args.describe: paramater_names = args.describe all_names = context.list_parameters() not_params = set(paramater_names) - set(all_names) if not_params: from ..exceptions import ArgumentError from ..resolve import dashlist raise ArgumentError("Invalid configuration parameters: %s" % dashlist(not_params)) if context.json: print(json.dumps([context.describe_parameter(name) for name in paramater_names], sort_keys=True, indent=2, separators=(',', ': '), cls=EntityEncoder)) else: builder = [] builder.extend(concat(parameter_description_builder(name) for name in paramater_names)) print('\n'.join(builder)) else: if context.json: skip_categories = ('CLI-only', 'Hidden and Undocumented') paramater_names = sorted(concat( parameter_names for category, parameter_names in context.category_map.items() if category not in skip_categories )) print(json.dumps([context.describe_parameter(name) for name in paramater_names], sort_keys=True, indent=2, separators=(',', ': '), cls=EntityEncoder)) else: print(describe_all_parameters()) return if args.validate: context.validate_all() return if args.system: rc_path = sys_rc_path elif args.env: if 'CONDA_PREFIX' in os.environ: rc_path = join(os.environ['CONDA_PREFIX'], '.condarc') else: rc_path = user_rc_path elif args.file: rc_path = args.file else: rc_path = user_rc_path if args.write_default: if isfile(rc_path): with open(rc_path) as fh: data = fh.read().strip() if data: raise CondaError("The file '%s' " "already contains configuration information.\n" "Remove the file to proceed.\n" "Use `conda config --describe` to display default configuration." 
% rc_path) with open(rc_path, 'w') as fh: fh.write(describe_all_parameters()) return # read existing condarc if os.path.exists(rc_path): with open(rc_path, 'r') as fh: rc_config = yaml_load(fh) or {} else: rc_config = {} grouped_paramaters = groupby(lambda p: context.describe_parameter(p)['parameter_type'], context.list_parameters()) primitive_parameters = grouped_paramaters['primitive'] sequence_parameters = grouped_paramaters['sequence'] map_parameters = grouped_paramaters['map'] # Get if args.get is not None: context.validate_all() if args.get == []: args.get = sorted(rc_config.keys()) for key in args.get: if key not in primitive_parameters + sequence_parameters: message = "unknown key %s" % key if not context.json: print(message, file=sys.stderr) else: json_warnings.append(message) continue if key not in rc_config: continue if context.json: json_get[key] = rc_config[key] continue if isinstance(rc_config[key], (bool, string_types)): print("--set", key, rc_config[key]) else: # assume the key is a list-type # Note, since conda config --add prepends, these are printed in # the reverse order so that entering them in this order will # recreate the same file items = rc_config.get(key, []) numitems = len(items) for q, item in enumerate(reversed(items)): # Use repr so that it can be pasted back in to conda config --add if key == "channels" and q in (0, numitems-1): print("--add", key, repr(item), " # lowest priority" if q == 0 else " # highest priority") else: print("--add", key, repr(item)) if args.stdin: content = timeout(5, sys.stdin.read) if not content: return try: parsed = yaml_load(content) rc_config.update(parsed) except Exception: # pragma: no cover from ..exceptions import ParseError raise ParseError("invalid yaml content:\n%s" % content) # prepend, append, add for arg, prepend in zip((args.prepend, args.append), (True, False)): for key, item in arg: if key == 'channels' and key not in rc_config: rc_config[key] = ['defaults'] if key not in sequence_parameters: from ..exceptions import CondaValueError raise CondaValueError("Key '%s' is not a known sequence parameter." % key) if not isinstance(rc_config.get(key, []), list): from ..exceptions import CouldntParseError bad = rc_config[key].__class__.__name__ raise CouldntParseError("key %r should be a list, not %s." % (key, bad)) arglist = rc_config.setdefault(key, []) if item in arglist: # Right now, all list keys should not contain duplicates message = "Warning: '%s' already in '%s' list, moving to the %s" % ( item, key, "top" if prepend else "bottom") arglist = rc_config[key] = [p for p in arglist if p != item] if not context.json: print(message, file=sys.stderr) else: json_warnings.append(message) arglist.insert(0 if prepend else len(arglist), item) # Set for key, item in args.set: key, subkey = key.split('.', 1) if '.' in key else (key, None) if key in primitive_parameters: value = context.typify_parameter(key, item) rc_config[key] = value elif key in map_parameters: argmap = rc_config.setdefault(key, {}) argmap[subkey] = item else: from ..exceptions import CondaValueError raise CondaValueError("Key '%s' is not a known primitive parameter." % key) # Remove for key, item in args.remove: key, subkey = key.split('.', 1) if '.' 
in key else (key, None) if key not in rc_config: if key != 'channels': from ..exceptions import CondaKeyError raise CondaKeyError(key, "key %r is not in the config file" % key) rc_config[key] = ['defaults'] if item not in rc_config[key]: from ..exceptions import CondaKeyError raise CondaKeyError(key, "%r is not in the %r key of the config file" % (item, key)) rc_config[key] = [i for i in rc_config[key] if i != item] # Remove Key for key, in args.remove_key: key, subkey = key.split('.', 1) if '.' in key else (key, None) if key not in rc_config: from ..exceptions import CondaKeyError raise CondaKeyError(key, "key %r is not in the config file" % key) del rc_config[key] # config.rc_keys if not args.get: # Add representers for enums. # Because a representer cannot be added for the base Enum class (it must be added for # each specific Enum subclass), and because of import rules), I don't know of a better # location to do this. def enum_representer(dumper, data): return dumper.represent_str(str(data)) yaml.representer.RoundTripRepresenter.add_representer(SafetyChecks, enum_representer) yaml.representer.RoundTripRepresenter.add_representer(PathConflict, enum_representer) try: with open(rc_path, 'w') as rc: rc.write(yaml_dump(rc_config)) except (IOError, OSError) as e: raise CondaError('Cannot write to condarc file at %s\n' 'Caused by %r' % (rc_path, e)) if context.json: from .common import stdout_json_success stdout_json_success( rc_path=rc_path, warnings=json_warnings, get=json_get ) return
def direct_quotations(doc): """ Baseline, not-great attempt at direction quotation extraction (no indirect or mixed quotations) using rules and patterns. English only. Args: doc (``spacy.Doc``) Yields: (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc`` represented as a (speaker, reporting verb, quotation) 3-tuple Notes: Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic Tagging of Reported Speech in Newspaper Articles". TODO: Better approach would use ML, but needs a training dataset. """ quote_end_punct = {',', '.', '?', '!'} quote_indexes = set(itertoolz.concat( (m.start(), m.end() - 1) for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string))) quote_positions = list(itertoolz.partition( 2, sorted(tok.i for tok in doc if tok.idx in quote_indexes))) sents = list(doc.sents) sent_positions = [(sent.start, sent.end) for sent in sents] for q0, q1 in quote_positions: quote = doc[q0: q1 + 1] # we're only looking for direct quotes, not indirect or mixed if not any(char in quote_end_punct for char in quote.text[-4:]): continue # get adjacent sentences candidate_sent_indexes = [] for i, (s0, s1) in enumerate(sent_positions): if s0 <= q1 + 1 and s1 > q1: candidate_sent_indexes.append(i) elif s0 < q0 and s1 >= q0 - 1: candidate_sent_indexes.append(i) for si in candidate_sent_indexes: sent = sents[si] # get any reporting verbs rvs = [tok for tok in sent if spacy_utils.preserve_case(tok) is False and tok.lemma_ in REPORTING_VERBS and tok.pos_ == 'VERB' and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)] # get target offset against which to measure distances of NEs if rvs: if len(rvs) == 1: rv = rvs[0] else: min_rv_dist = 1000 for rv_candidate in rvs: rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1)) if rv_dist < min_rv_dist: rv = rv_candidate min_rv_dist = rv_dist else: break else: # TODO: do we have no other recourse?! continue try: # rv_subj = _find_subjects(rv)[0] rv_subj = get_subjects_of_verb(rv)[0] except IndexError: continue # if rv_subj.text in {'he', 'she'}: # for ne in named_entities(doc, good_ne_types={'PERSON'}): # if ne.start < rv_subj.i: # speaker = ne # else: # break # else: span = get_span_for_compound_noun(rv_subj) speaker = doc[span[0]: span[1] + 1] yield (speaker, rv, quote) break
def solve_final_state(self, update_modifier=NULL, deps_modifier=NULL, prune=NULL, ignore_pinned=NULL, force_remove=NULL): """Gives the final, solved state of the environment. Args: update_modifier (UpdateModifier): An optional flag directing how updates are handled regarding packages already existing in the environment. deps_modifier (DepsModifier): An optional flag indicating special solver handling for dependencies. The default solver behavior is to be as conservative as possible with dependency updates (in the case the dependency already exists in the environment), while still ensuring all dependencies are satisfied. Options include * NO_DEPS * ONLY_DEPS * UPDATE_DEPS * UPDATE_DEPS_ONLY_DEPS * FREEZE_INSTALLED prune (bool): If ``True``, the solution will not contain packages that were previously brought into the environment as dependencies but are no longer required as dependencies and are not user-requested. ignore_pinned (bool): If ``True``, the solution will ignore pinned package configuration for the prefix. force_remove (bool): Forces removal of a package without removing packages that depend on it. Returns: Tuple[PackageRef]: In sorted dependency order from roots to leaves, the package references for the solved state of the environment. """ if update_modifier is NULL: update_modifier = context.update_modifier else: update_modifier = UpdateModifier(text_type(update_modifier).lower()) if deps_modifier is NULL: deps_modifier = context.deps_modifier else: deps_modifier = DepsModifier(text_type(deps_modifier).lower()) prune = context.prune if prune is NULL else prune ignore_pinned = context.ignore_pinned if ignore_pinned is NULL else ignore_pinned force_remove = context.force_remove if force_remove is NULL else force_remove specs_to_remove = self.specs_to_remove specs_to_add = self.specs_to_add # force_remove is a special case where we return early if specs_to_remove and force_remove: if specs_to_add: raise NotImplementedError() solution = tuple(prec for prec in PrefixData(self.prefix).iter_records() if not any(spec.match(prec) for spec in specs_to_remove)) return IndexedSet(PrefixGraph(solution).graph) log.debug("solving prefix %s\n" " specs_to_remove: %s\n" " specs_to_add: %s\n" " prune: %s", self.prefix, specs_to_remove, specs_to_add, prune) # declare starting point, the initial state of the environment # `solution` and `specs_map` are mutated throughout this method prefix_data = PrefixData(self.prefix) solution = tuple(prec for prec in prefix_data.iter_records()) # Check if specs are satisfied by current environment. If they are, exit early. if (update_modifier == UpdateModifier.SPECS_SATISFIED_SKIP_SOLVE and not specs_to_remove and not prune): for spec in specs_to_add: if not next(prefix_data.query(spec), None): break else: # All specs match a package in the current environment. # Return early, with a solution that should just be PrefixData().iter_records() return IndexedSet(PrefixGraph(solution).graph) specs_from_history_map = History(self.prefix).get_requested_specs_map() if prune: # or update_modifier == UpdateModifier.UPDATE_ALL # pending conda/constructor#138 # NOQA # Users are struggling with the prune functionality in --update-all, due to # https://github.com/conda/constructor/issues/138. Until that issue is resolved, # and for the foreseeable future, it's best to be more conservative with --update-all. # Start with empty specs map for UPDATE_ALL because we're optimizing the update # only for specs the user has requested; it's ok to remove dependencies. 
specs_map = odict() # However, because of https://github.com/conda/constructor/issues/138, we need # to hard-code keeping conda, conda-build, and anaconda, if they're already in # the environment. solution_pkg_names = set(d.name for d in solution) ensure_these = (pkg_name for pkg_name in { 'anaconda', 'conda', 'conda-build', } if pkg_name not in specs_from_history_map and pkg_name in solution_pkg_names) for pkg_name in ensure_these: specs_from_history_map[pkg_name] = MatchSpec(pkg_name) else: specs_map = odict((d.name, MatchSpec(d.name)) for d in solution) # add in historically-requested specs specs_map.update(specs_from_history_map) # let's pretend for now that this is the right place to build the index prepared_specs = set(concatv( specs_to_remove, specs_to_add, itervalues(specs_from_history_map), )) index, r = self._prepare(prepared_specs) if specs_to_remove: # In a previous implementation, we invoked SAT here via `r.remove()` to help with # spec removal, and then later invoking SAT again via `r.solve()`. Rather than invoking # SAT for spec removal determination, we can use the PrefixGraph and simple tree # traversal if we're careful about how we handle features. We still invoke sat via # `r.solve()` later. _track_fts_specs = (spec for spec in specs_to_remove if 'track_features' in spec) feature_names = set(concat(spec.get_raw_value('track_features') for spec in _track_fts_specs)) graph = PrefixGraph(solution, itervalues(specs_map)) all_removed_records = [] no_removed_records_specs = [] for spec in specs_to_remove: # If the spec was a track_features spec, then we need to also remove every # package with a feature that matches the track_feature. The # `graph.remove_spec()` method handles that for us. log.trace("using PrefixGraph to remove records for %s", spec) removed_records = graph.remove_spec(spec) if removed_records: all_removed_records.extend(removed_records) else: no_removed_records_specs.append(spec) # ensure that each spec in specs_to_remove is actually associated with removed records unmatched_specs_to_remove = tuple( spec for spec in no_removed_records_specs if not any(spec.match(rec) for rec in all_removed_records) ) if unmatched_specs_to_remove: raise PackagesNotFoundError( tuple(sorted(str(s) for s in unmatched_specs_to_remove)) ) for rec in all_removed_records: # We keep specs (minus the feature part) for the non provides_features packages # if they're in the history specs. Otherwise, we pop them from the specs_map. rec_has_a_feature = set(rec.features or ()) & feature_names if rec_has_a_feature and rec.name in specs_from_history_map: spec = specs_map.get(rec.name, MatchSpec(rec.name)) spec._match_components.pop('features', None) specs_map[spec.name] = spec else: specs_map.pop(rec.name, None) solution = tuple(graph.graph) # We handle as best as possible environments in inconsistent states. To do this, # we remove now from consideration the set of packages causing inconsistencies, # and then we add them back in following the main SAT call. _, inconsistent_precs = r.bad_installed(solution, ()) add_back_map = {} # name: (prec, spec) if log.isEnabledFor(DEBUG): log.debug("inconsistent precs: %s", dashlist(inconsistent_precs) if inconsistent_precs else 'None') if inconsistent_precs: for prec in inconsistent_precs: # pop and save matching spec in specs_map add_back_map[prec.name] = (prec, specs_map.pop(prec.name, None)) solution = tuple(prec for prec in solution if prec not in inconsistent_precs) # For the remaining specs in specs_map, add target to each spec. 
    # to the package currently existing in the environment.  Setting target instructs the
    # solver to not disturb that package if it's not necessary.
    # If the spec.name is being modified by inclusion in specs_to_add, we don't set `target`,
    # since we *want* the solver to modify/update that package.
    #
    # TLDR: when working with MatchSpec objects,
    #  - to minimize the version change, set MatchSpec(name=name, target=prec.dist_str())
    #  - to freeze the package, set all the components of MatchSpec individually
    for pkg_name, spec in iteritems(specs_map):
        matches_for_spec = tuple(prec for prec in solution if spec.match(prec))
        if matches_for_spec:
            if len(matches_for_spec) != 1:
                raise CondaError(dals("""
                    Conda encountered an error with your environment.  Please report an issue
                    at https://github.com/conda/conda/issues/new.  In your report, please
                    include the output of 'conda info' and 'conda list' for the active
                    environment, along with the command you invoked that resulted in this
                    error.
                      pkg_name: %s
                      spec: %s
                      matches_for_spec: %s
                    """) % (pkg_name, spec,
                            dashlist((text_type(s) for s in matches_for_spec), indent=4)))
            target_prec = matches_for_spec[0]
            if update_modifier == UpdateModifier.FREEZE_INSTALLED:
                new_spec = MatchSpec(target_prec)
            else:
                target = target_prec.dist_str()
                new_spec = MatchSpec(spec, target=target)
            specs_map[pkg_name] = new_spec
    if log.isEnabledFor(TRACE):
        log.trace("specs_map with targets: %s", specs_map)

    # If we're in UPDATE_ALL mode, we need to drop all the constraints attached to specs,
    # so they can all float and the solver can find the most up-to-date solution.  In the
    # case of UPDATE_ALL, `specs_map` wasn't initialized with packages from the current
    # environment, but *only* with historically-requested specs.  This lets UPDATE_ALL drop
    # dependencies that are no longer needed and whose presence would otherwise prevent the
    # updated solution the user most likely wants.
    if update_modifier == UpdateModifier.UPDATE_ALL:
        specs_map = {pkg_name: MatchSpec(spec.name, optional=spec.optional)
                     for pkg_name, spec in iteritems(specs_map)}

    # As a business rule, we never want to update python beyond the current minor version,
    # unless that's requested explicitly by the user (which we actively discourage).
    if 'python' in specs_map:
        python_prefix_rec = prefix_data.get('python')
        if python_prefix_rec:
            python_spec = specs_map['python']
            if not python_spec.get('version'):
                pinned_version = get_major_minor_version(python_prefix_rec.version) + '.*'
                specs_map['python'] = MatchSpec(python_spec, version=pinned_version)

    # For the aggressive_update_packages configuration parameter, we strip any target
    # that's been set.
    if not context.offline:
        for spec in context.aggressive_update_packages:
            if spec.name in specs_map:
                specs_map[spec.name] = spec
        if (context.auto_update_conda and paths_equal(self.prefix, context.root_prefix)
                and any(prec.name == "conda" for prec in solution)):
            specs_map["conda"] = MatchSpec("conda")

    # add in explicitly requested specs from specs_to_add
    # this overrides any name-matching spec already in the spec map
    specs_map.update((s.name, s) for s in specs_to_add)

    # collect additional specs to add to the solution
    track_features_specs = pinned_specs = ()
    if context.track_features:
        track_features_specs = tuple(MatchSpec(x + '@') for x in context.track_features)
    if not ignore_pinned:
        pinned_specs = get_pinned_specs(self.prefix)

    final_environment_specs = IndexedSet(concatv(
        itervalues(specs_map),
        track_features_specs,
        pinned_specs,
    ))

    # We've previously checked `solution` for consistency (which at that point was the
    # pre-solve state of the environment).  Now we check our compiled set of
    # `final_environment_specs` for the possibility of a solution.  If there are conflicts,
    # we can often avoid them by neutering specs that have a target (e.g. removing the
    # version constraint) and also making them optional.  The result here will be fewer
    # cases of `UnsatisfiableError` handed to users, at the cost of more packages being
    # modified or removed from the environment.
    conflicting_specs = r.get_conflicting_specs(tuple(final_environment_specs))
    if log.isEnabledFor(DEBUG):
        log.debug("conflicting specs: %s", dashlist(conflicting_specs))
    for spec in conflicting_specs:
        if spec.target:
            final_environment_specs.remove(spec)
            neutered_spec = MatchSpec(spec.name, target=spec.target, optional=True)
            final_environment_specs.add(neutered_spec)

    # Finally! We get to call SAT.
    if log.isEnabledFor(DEBUG):
        log.debug("final specs to add: %s",
                  dashlist(sorted(text_type(s) for s in final_environment_specs)))
    solution = r.solve(tuple(final_environment_specs))  # return value is List[PackageRecord]

    # add back inconsistent packages to solution
    if add_back_map:
        for name, (prec, spec) in iteritems(add_back_map):
            if not any(d.name == name for d in solution):
                solution.append(prec)
                if spec:
                    final_environment_specs.add(spec)

    # Special case handling for various DepsModifier flags.  Maybe this block could be
    # pulled out into its own non-public helper method?
    if deps_modifier == DepsModifier.NO_DEPS:
        # In the NO_DEPS case, we need to start with the original list of packages in the
        # environment, and then only modify packages that match specs_to_add or
        # specs_to_remove.
        _no_deps_solution = IndexedSet(prefix_data.iter_records())
        only_remove_these = set(prec
                                for spec in specs_to_remove
                                for prec in _no_deps_solution
                                if spec.match(prec))
        _no_deps_solution -= only_remove_these

        only_add_these = set(prec
                             for spec in specs_to_add
                             for prec in solution
                             if spec.match(prec))
        remove_before_adding_back = set(prec.name for prec in only_add_these)
        _no_deps_solution = IndexedSet(prec for prec in _no_deps_solution
                                       if prec.name not in remove_before_adding_back)
        _no_deps_solution |= only_add_these
        solution = _no_deps_solution
    elif (deps_modifier == DepsModifier.ONLY_DEPS
            and update_modifier != UpdateModifier.UPDATE_DEPS):
        # Use a special instance of PrefixGraph to remove youngest child nodes that match
        # the original specs_to_add.  It's important to remove only the *youngest* child
        # nodes, because a typical use might be `conda install --only-deps python=2 flask`,
        # and in that case we'd want to keep python.
        graph = PrefixGraph(solution, specs_to_add)
        graph.remove_youngest_descendant_nodes_with_specs()
        solution = tuple(graph.graph)
    elif update_modifier == UpdateModifier.UPDATE_DEPS:
        # Here we have to SAT solve again :(  It's only now that we know the dependency
        # chain of specs_to_add.
        specs_to_add_names = set(spec.name for spec in specs_to_add)
        update_names = set()
        graph = PrefixGraph(solution, final_environment_specs)
        for spec in specs_to_add:
            node = graph.get_node_by_name(spec.name)
            for ancestor_record in graph.all_ancestors(node):
                ancestor_name = ancestor_record.name
                if ancestor_name not in specs_to_add_names:
                    update_names.add(ancestor_name)
        grouped_specs = groupby(lambda s: s.name in update_names, final_environment_specs)
        new_final_environment_specs = set(grouped_specs.get(False, ()))
        update_specs = set(MatchSpec(spec.name, optional=spec.optional)
                           for spec in grouped_specs.get(True, ()))
        final_environment_specs = new_final_environment_specs | update_specs
        solution = r.solve(final_environment_specs)

        if deps_modifier == DepsModifier.ONLY_DEPS:
            # duplicated from DepsModifier.ONLY_DEPS
            graph = PrefixGraph(solution, specs_to_add)
            graph.remove_youngest_descendant_nodes_with_specs()
            solution = tuple(graph.graph)

    if prune:
        graph = PrefixGraph(solution, final_environment_specs)
        graph.prune()
        solution = tuple(graph.graph)

    self._check_solution(solution, pinned_specs)

    solution = IndexedSet(PrefixGraph(solution).graph)
    log.debug("solved prefix %s\n"
              "  solved_linked_dists:\n"
              "    %s\n",
              self.prefix, "\n    ".join(prec.dist_str() for prec in solution))

    return solution
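
# A minimal, standalone sketch (not part of the solver above) of the partitioning idiom used
# in the UPDATE_DEPS branch: toolz.groupby with a boolean key splits one collection of specs
# into the names that should float freely (True) and the specs whose constraints are kept
# (False).  The FakeSpec namedtuple and the package names are purely illustrative, and the
# plain `toolz` import is assumed in place of whatever vendored copy the surrounding code uses.
from collections import namedtuple

from toolz import groupby

FakeSpec = namedtuple("FakeSpec", ["name"])

def partition_for_update(specs, update_names):
    # groupby preserves the original order of items within each group
    grouped = groupby(lambda s: s.name in update_names, specs)
    return grouped.get(True, []), grouped.get(False, [])

floating, kept = partition_for_update(
    [FakeSpec("numpy"), FakeSpec("python"), FakeSpec("flask")],
    update_names={"numpy"},
)
assert [s.name for s in floating] == ["numpy"]
assert [s.name for s in kept] == ["python", "flask"]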
def query_all(spec):
    # Fan the query out to each SubdirData via the executor (both `executor` and
    # `subdir_datas` come from the enclosing scope), then flatten the per-source
    # results into one tuple as the futures complete.
    futures = tuple(executor.submit(sd.query, spec) for sd in subdir_datas)
    return tuple(concat(future.result() for future in as_completed(futures)))
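
# A self-contained sketch of the fan-out/flatten pattern used by query_all above: submit one
# query per data source to a thread pool, then chain the per-source result lists together with
# toolz.concat as the futures finish.  The `fake_sources` mapping and `lookup` helper are
# hypothetical stand-ins for subdir_datas and sd.query; result order depends on completion
# order, so the check compares sorted output.
from concurrent.futures import ThreadPoolExecutor, as_completed

from toolz import concat

fake_sources = {
    "channel-a": ["numpy-1.15", "numpy-1.16"],
    "channel-b": ["numpy-1.16"],
}

def lookup(source_name, prefix):
    # stand-in for a per-source query; returns records whose names start with `prefix`
    return [rec for rec in fake_sources[source_name] if rec.startswith(prefix)]

def query_all_sketch(prefix):
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = tuple(executor.submit(lookup, name, prefix) for name in fake_sources)
        return tuple(concat(future.result() for future in as_completed(futures)))

assert sorted(query_all_sketch("numpy")) == ["numpy-1.15", "numpy-1.16", "numpy-1.16"]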
def iterations(self) -> int:
    return len(list(concat(self.epoch)))
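
# A quick illustration of the counting idiom in iterations above, assuming the standalone
# toolz package: concat flattens exactly one level of nesting, so the length of its output is
# the total number of items across all inner batches.  Because concat is lazy, counting
# consumes the iterable; if self.epoch were a one-shot generator, a second call would see it
# empty.
from toolz import concat

batches = [[1, 2], [3], []]
assert len(list(concat(batches))) == 3                 # three items across all batches
assert list(concat([[1, [2]], [3]])) == [1, [2], 3]    # deeper nesting is left untouched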
def test_concat():
    assert list(concat([[], [], []])) == []
    assert list(take(5, concat([["a", "b"], range(1000000000)]))) == ["a", "b", 0, 1, 2]
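
# A possible companion check (not part of the original test): concatv(*seqs) is the variadic
# form of concat(seqs), and both are lazy, so an effectively infinite tail such as
# itertools.count() is safe as long as only a finite prefix is consumed.
from itertools import count

from toolz import concat, concatv, take

def test_concatv_matches_concat():
    seqs = (["a", "b"], [1, 2])
    assert list(concatv(*seqs)) == list(concat(seqs)) == ["a", "b", 1, 2]
    assert list(take(3, concatv(["a"], count()))) == ["a", 0, 1]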