Example #1
def test_peek():
    alist = ["Alice", "Bob", "Carol"]
    element, blist = peek(alist)
    assert element == alist[0]
    assert list(blist) == alist

    assert raises(StopIteration, lambda: peek([]))
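The test above pins down the contract of peek: it returns the first element plus an iterable that still yields every original item, and raises StopIteration on empty input. A minimal sketch of a helper with that behaviour (illustrative only, not the library's actual implementation):

import itertools

def peek(seq):
    """Return (first element, iterable over all elements); raise StopIteration if empty."""
    iterator = iter(seq)
    first = next(iterator)  # raises StopIteration when seq is empty
    # Re-attach the consumed element so callers still see the full sequence.
    return first, itertools.chain([first], iterator)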
Example #3
    def load(cls, filepath):
        """
        Load documents' pickled content and metadata from disk, and initialize
        a :class:`Corpus` with a spacy language pipeline equivalent to what was
        in use previously, when the corpus was saved.

        Args:
            filepath (str): Full path to file on disk where documents' content and
                metadata are saved.

        Returns:
            :class:`Corpus`

        See Also:
            :meth:`Corpus.save()`
        """
        spacy_docs = io.read_spacy_docs(filepath)
        # HACK: pop spacy language metadata from first doc's user_data
        # so we can (more or less...) re-instantiate the same language pipeline
        first_spacy_doc, spacy_docs = itertoolz.peek(spacy_docs)
        spacy_lang_meta = first_spacy_doc.user_data['textacy'].pop(
            'spacy_lang_meta')
        # manually instantiate the spacy language pipeline and
        # hope that the spacy folks either make this easier or don't touch it
        spacy_lang = get_lang_class(spacy_lang_meta['lang'])(
            vocab=first_spacy_doc.vocab, meta=spacy_lang_meta)
        for name in spacy_lang_meta['pipeline']:
            spacy_lang.add_pipe(spacy_lang.create_pipe(name))
        return cls(spacy_lang, docs=spacy_docs)
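Example #4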
    def add(
        self,
        data: CorpusData,
        batch_size: int = 1000,
        n_process: int = 1,
    ) -> None:
        """
        Add one or a stream of texts, records, or :class:`spacy.tokens.Doc` s
        to the corpus, ensuring that all processing is or has already been done
        by the :attr:`Corpus.spacy_lang` pipeline.

        Args:
            data
            batch_size: Number of texts to buffer when processing with spaCy.
            n_process: Number of parallel processors to run when processing.
                If -1, this is set to ``multiprocessing.cpu_count()``.

                .. note:: This feature is only available in spaCy 2.2.2+, and only applies
                   when ``data`` is a sequence of texts or records.

        See Also:
            * :meth:`Corpus.add_text()`
            * :meth:`Corpus.add_texts()`
            * :meth:`Corpus.add_record()`
            * :meth:`Corpus.add_records()`
            * :meth:`Corpus.add_doc()`
            * :meth:`Corpus.add_docs()`
        """
        if isinstance(data, str):
            self.add_text(data)
        elif isinstance(data, spacy.tokens.Doc):
            self.add_doc(data)
        elif utils.is_record(data):
            self.add_record(data)
        elif isinstance(data, collections.abc.Iterable):
            first, data = itertoolz.peek(data)
            if isinstance(first, str):
                self.add_texts(data,
                               batch_size=batch_size,
                               n_process=n_process)
            elif isinstance(first, spacy.tokens.Doc):
                self.add_docs(data)
            elif utils.is_record(first):
                self.add_records(data,
                                 batch_size=batch_size,
                                 n_process=n_process)
            else:
                raise TypeError(
                    "data must be one of {} or an interable thereof, not {}".
                    format(
                        {str, spacy.tokens.Doc, tuple},
                        type(data),
                    ))
        else:
            raise TypeError(
                "data must be one of {} or an interable thereof, not {}".
                format(
                    {str, spacy.tokens.Doc, tuple},
                    type(data),
                ))
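The branching above only works because itertoolz.peek hands back an iterable that still includes the inspected first item; otherwise the first text, record, or doc would be silently dropped. A small illustration of the pattern, assuming cytoolz is installed:

from cytoolz import itertoolz

stream = (text for text in ["first text", "second text"])  # one-shot generator
first, stream = itertoolz.peek(stream)
assert isinstance(first, str)                          # decide how to process the stream
assert list(stream) == ["first text", "second text"]   # nothing was lost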
Example #5
    def add(self, data, batch_size=1000):
        """
        Add one or a stream of texts, records, or :class:`spacy.tokens.Doc` s
        to the corpus, ensuring that all processing is or has already been done
        by the :attr:`Corpus.spacy_lang` pipeline.

        Args:
            data (obj or Iterable[obj]):
                str or Iterable[str]
                Tuple[str, dict] or Iterable[Tuple[str, dict]]
                :class:`spacy.tokens.Doc` or Iterable[:class:`spacy.tokens.Doc`]
            batch_size (int)

        See Also:
            * :meth:`Corpus.add_text()`
            * :meth:`Corpus.add_texts()`
            * :meth:`Corpus.add_record()`
            * :meth:`Corpus.add_records()`
            * :meth:`Corpus.add_doc()`
            * :meth:`Corpus.add_docs()`
        """
        if isinstance(data, compat.unicode_):
            self.add_text(data)
        elif isinstance(data, spacy.tokens.Doc):
            self.add_doc(data)
        elif utils.is_record(data):
            self.add_record(data)
        elif isinstance(data, compat.Iterable):
            first, data = itertoolz.peek(data)
            if isinstance(first, compat.unicode_):
                self.add_texts(data, batch_size=batch_size)
            elif isinstance(first, spacy.tokens.Doc):
                self.add_docs(data)
            elif utils.is_record(first):
                self.add_records(data, batch_size=batch_size)
            else:
                raise TypeError(
                    "data must be one of {} or an interable thereof, not {}".
                    format(
                        {compat.unicode_, spacy.tokens.Doc, tuple},
                        type(data),
                    ))
        else:
            raise TypeError(
                "data must be one of {} or an interable thereof, not {}".
                format(
                    {compat.unicode_, spacy.tokens.Doc, tuple},
                    type(data),
                ))
Example #6
def empty(it: Iterable[E]) -> Tuple[bool, Iterable[E]]:
    """
    Check whether the sequence is empty.

    NOTE: This function consumes the first element of the original sequence;
    use the returned iterable, which still contains all of the original items.

    >>> it_orig = iter([1, 2, 3])
    >>> is_empty, it_new = empty(it_orig)
    >>> is_empty, list(it_new)
    (False, [1, 2, 3])

    >>> is_empty, it_empty = empty(iter([]))
    >>> is_empty, list(it_empty)
    (True, [])
    """
    try:
        _, it = peek(it)
        return False, it
    except StopIteration:
        return True, iter([])
Example #7
def head_tail(it: Iterable[E]) -> Tuple[E, Iterable[E]]:
    """
    Split the provided iterable into a head element and a tail iterable.

    >>> head, tail = head_tail(iter([1, 2, 3]))
    >>> head, list(tail)
    (1, [2, 3])

    >>> head, tail = head_tail(iter([42]))
    >>> head, list(tail)
    (42, [])

    Raises :class:`StopIteration` if the original iterable is empty.

    >>> head_tail(iter([]))
    Traceback (most recent call last):
    ...
    StopIteration
    """
    head, seq = peek(it)
    tail = drop(1, seq)
    return head, tail
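Both helpers build on the same peek call: empty turns the StopIteration into a boolean, while head_tail drops the re-attached head to expose the rest. A short sketch of how they might compose, assuming the two functions above are importable:

def describe(it):
    # Check for emptiness first, then split into head and tail.
    is_empty, it = empty(it)
    if is_empty:
        return "empty"
    head, tail = head_tail(it)
    return "starts with {!r}, {} more item(s)".format(head, len(list(tail)))

print(describe(iter([])))         # empty
print(describe(iter([1, 2, 3])))  # starts with 1, 2 more item(s)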
Example #8
def build_graph_from_terms(terms,
                           *,
                           normalize="lemma",
                           window_size=10,
                           edge_weighting="count"):
    """
    Transform an ordered list of non-overlapping terms into a graph,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_size`` terms of itself.

    Args:
        terms (List[str] or List[:class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`])
        normalize (str or Callable): If "lemma", lemmatize terms; if "lower",
            lowercase terms; if falsy, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``Token`` and return
            a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``Token`` or ``Span``.

        window_size (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({"count", "binary"}): If "count", the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight = 1.

    Returns:
        :class:`networkx.Graph`: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.
    """
    if window_size < 2:
        raise ValueError(
            "window_size = {} is invalid; value must be >= 2".format(
                window_size))
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()

    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms),
            window_size,
            len(terms),
        )
        window_size = len(terms)

    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, str):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, ke_utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)))

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2))
        graph.add_edges_from((w1, w2, {
            "weight": weight
        }) for (w1, w2), weight in cooc_mat.items())
    elif edge_weighting == "binary":
        graph.add_edges_from(w1_w2 for window in windows
                             for w1_w2 in itertools.combinations(window, 2))
    else:
        raise ValueError(
            "edge_weighting = {} is invalid; must be one of {}".format(
                edge_weighting, {"count", "binary"}))

    return graph
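A quick usage sketch of the function above (assuming networkx is installed and build_graph_from_terms is importable); with plain strings, normalize is ignored and only the sliding-window co-occurrence counts matter:

terms = ["cat", "dog", "cat", "bird", "dog"]
graph = build_graph_from_terms(terms, window_size=2, edge_weighting="count")
print(graph.number_of_nodes())               # 3
# Each adjacent pair co-occurs once per window; repeated pairs accumulate weight.
print(sorted(graph.edges(data="weight")))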