Example #1
def terms_to_semantic_network(terms,
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (list(str) or list(``spacy.Token``))
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        :class:`networkx.Graph()`

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to normalize so that like terms
          are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2.')

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], spacy_token):
        windows = ((normalized_str(tok) for tok in window)
                   for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(
            type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = defaultdict(lambda: defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from((w1, w2, {
            'weight': cooc_mat[w1][w2]
        }) for w1, w2s in cooc_mat.items() for w2 in w2s)

    elif edge_weighting == 'binary':
        graph.add_edges_from(w1_w2 for window in windows
                             for w1_w2 in itertools.combinations(window, 2))

    return graph
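
# A minimal usage sketch for the function above (hedged): it assumes the module-level
# imports from the original source (cytoolz's itertoolz, itertools, defaultdict, and
# networkx as nx) are in place; the term list is purely illustrative.
terms = ["graph", "node", "edge", "graph", "network", "node"]
graph = terms_to_semantic_network(terms, window_width=3, edge_weighting='cooc_freq')
print(sorted(graph.edges(data=True)))  # weighted co-occurrence edges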
Example #2
def terms_to_semantic_network(terms,
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (list(str) or list(``spacy.Token``))
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        :class:`networkx.Graph()`

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to normalize so that like terms
          are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2.')

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], spacy_token):
        windows = ((normalized_str(tok) for tok in window)
                   for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = defaultdict(lambda: defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': cooc_mat[w1][w2]})
            for w1, w2s in cooc_mat.items() for w2 in w2s)

    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph
Example #3
 def context_and_token(
         self, tokens: Iterable[Token]
 ) -> Iterable[Tuple[Sequence[Token], Token]]:
     padding = [self.codec.PAD] * self.context_size
     for window in sliding_window(self.context_size + 1,
                                  concat([padding, tokens, padding])):
         yield (window[:-1], window[-1])
Example #4
def main():
    input_ = load_from_file("day9_input.txt")
    numbers = [int(line) for line in input_]

    for i in range(25, len(numbers)):
        if numbers[i] not in nondiagonal_sums(numbers[i - 25:i]):
            sol_pt1 = numbers[i]
            index_ = i
            break

    print(sol_pt1)
    assert sol_pt1 == 1492208709  # My solution

    preceding_nums = numbers[:index_]
    sliding_window_sums = {}
    for i in range(2, 100):
        sliding_window_sums[i] = list(
            map(sum, sliding_window(i, preceding_nums)))
        if sol_pt1 in sliding_window_sums[i]:
            index_of_sum = sliding_window_sums[i].index(sol_pt1)
            contiguous_range = numbers[index_of_sum:index_of_sum + i]
            sol_pt2 = min(contiguous_range) + max(contiguous_range)
            break

    print(sol_pt2)
    assert sol_pt2 == 238243506  # My solution
Example #5
 def __init__(self, dimensions: List[int]):
     super(Classifier, self).__init__()
     units = []
     for from_dimension, to_dimension in sliding_window(2, dimensions):
         units.append(nn.Linear(from_dimension, to_dimension))
         units.append(nn.ReLU())
     self.classifier = nn.Sequential(*units[:-1])
     self.softmax = nn.LogSoftmax(dim=1)
Example #6
def swap_words(aug_toks, *, num=1, pos=None):
    """
    Randomly swap the positions of two *adjacent* words,
    up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
            through position swapping.
        num (int or float): If int, maximum number of adjacent word pairs to swap;
            if float, probability that a given word pair will be swapped.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words are considered.

    Returns:
        List[:class:`AugTok`]: New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # if we don't require _adjacent_ words, this does the trick
    # if not pos:
    #     pos = set(aug_tok.pos for aug_tok in aug_toks if aug_tok.is_word)
    # cand_idx_pairs = list(
    #     itertools.chain.from_iterable(
    #         itertools.combinations(
    #             (idx for idx, aug_tok in enumerate(aug_toks) if aug_tok.pos == pos_),
    #             2,
    #         )
    #         for pos_ in pos
    #     )
    # )
    cand_idxs = (idx for idx, aug_tok in enumerate(aug_toks)
                 if aug_tok.is_word and (pos is None or aug_tok.pos in pos))
    cand_idx_pairs = [(idx1, idx2)
                      for idx1, idx2 in itertoolz.sliding_window(2, cand_idxs)
                      if idx2 - idx1 == 1]
    rand_idx_pairs = _select_random_candidates(cand_idx_pairs, num)
    if not rand_idx_pairs:
        return aug_toks[:]

    new_aug_toks = aug_toks[:]
    for idx1, idx2 in rand_idx_pairs:
        tok1 = new_aug_toks[idx1]
        tok2 = new_aug_toks[idx2]
        new_aug_toks[idx1] = aug_utils.AugTok(
            text=tok2.text,
            ws=tok1.ws,
            pos=tok2.pos,
            is_word=tok2.is_word,
            syns=tok2.syns,
        )
        new_aug_toks[idx2] = aug_utils.AugTok(
            text=tok1.text,
            ws=tok2.ws,
            pos=tok1.pos,
            is_word=tok1.is_word,
            syns=tok1.syns,
        )
    return new_aug_toks
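
# A hedged usage sketch: it assumes a recent textacy with its augmentation module and the
# en_core_web_sm spaCy model installed; to_aug_toks is assumed to be the helper that turns
# a spaCy Doc into the List[AugTok] this function expects.
import spacy
from textacy.augmentation import utils as aug_utils

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
aug_toks = aug_utils.to_aug_toks(doc)
swapped = swap_words(aug_toks, num=2, pos={"NOUN", "ADJ"})
print("".join(tok.text + tok.ws for tok in swapped))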
Example #7
def _randomly_segment_text(text: str, len_range: Tuple[int, int]) -> Iterable[str]:
    min_len, max_len = len_range
    idxs = []
    idx = 0
    while idx < len(text):
        idxs.append(idx)
        idx += random.randint(min_len, max_len)
    idxs.append(len(text))
    for idx_start, idx_end in itertoolz.sliding_window(2, idxs):
        yield text[idx_start : idx_end]
Example #8
def delete_words(
    aug_toks: List[aug_utils.AugTok],
    *,
    num: Union[int, float] = 1,
    pos: Optional[Union[str, Set[str]]] = None,
) -> List[aug_utils.AugTok]:
    """
    Randomly delete words,
    up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks: Sequence of tokens to augment through word deletion.
        num: If int, maximum number of words to delete;
            if float, probability that a given word will be deleted.
        pos: Part of speech tag(s) of words to be considered for augmentation.
            If None, all words are considered.

    Returns:
        New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = cast(Set[str], utils.to_collection(pos, str, set))
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]

    cand_idxs = [
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos) and idx > 0
    ]
    rand_idxs = set(_select_random_candidates(cand_idxs, num))
    if not rand_idxs:
        return aug_toks[:]

    new_aug_toks: List[aug_utils.AugTok] = []
    # NOTE: https://github.com/python/mypy/issues/5492
    padded_triplets = itertoolz.sliding_window(
        3,
        [None] + aug_toks + [None],  # type: ignore
    )
    for idx, (prev_tok, curr_tok, next_tok) in enumerate(padded_triplets):
        if idx in rand_idxs:
            # special case: word then [deleted word] then punctuation
            # give deleted word's whitespace to previous word
            if prev_tok and next_tok and prev_tok.is_word and not next_tok.is_word:
                new_aug_toks[-1] = aug_utils.AugTok(
                    text=prev_tok.text,
                    ws=curr_tok.ws,
                    pos=prev_tok.pos,
                    is_word=prev_tok.is_word,
                    syns=prev_tok.syns,
                )
        else:
            new_aug_toks.append(curr_tok)
    return new_aug_toks
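
# A hedged sketch following the same pattern as the swap_words example above (aug_toks
# obtained via the assumed to_aug_toks helper): delete up to two words; the idx > 0 guard
# in this variant always keeps the sentence-initial token.
shortened = delete_words(aug_toks, num=2)
print("".join(tok.text + tok.ws for tok in shortened))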
Example #9
def iter_skip_window_walk(
    walk: List[Hashable], window_size: int
) -> Iterable[Tuple[int, int]]:
    """
    Given a walk of nodes and a window size (interpreted as the number of nodes to the left
    and to the right of a central node), iterate over the walk and yield the central node of
    each window paired with each other node in that window.

    :param walk: list of nodes
    :param window_size: number of nodes to the left and to the right
    :return: yields 2-tuples of source and target for training
    """
    for window in sliding_window(2 * window_size + 1, walk):
        for target in window[:window_size] + window[window_size + 1 :]:
            yield (window[window_size], target)
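
# A small illustrative check (hedged; toy walk only): with window_size=1 every interior
# node is paired with its immediate left and right neighbours, while nodes closer than
# window_size to either end of the walk never appear as the source.
walk = [0, 1, 2, 3]
print(list(iter_skip_window_walk(walk, window_size=1)))
# -> [(1, 0), (1, 2), (2, 1), (2, 3)]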
Example #10
    def create_word_word_embedding(self):
        print("    Counting word-word co-occurrences in {}-word moving window".
              format(self.window_size))
        count_matrix = np.zeros([self.num_vocab, self.num_vocab])

        for i in range(self.corpus.num_documents):
            current_token_list = self.corpus.document_token_list[i] + [
                '*PAD*'
            ] * self.window_size
            windows = itertoolz.sliding_window(self.window_size + 1,
                                               current_token_list)
            #  [(1,2,3,4), (2,3,4,5), ...]
            # flat 1 1 1
            # lin  3 2 1
            # nlin 4 2 1
            counter = 0
            for w in windows:
                for t1, t2, dist in zip([w[0]] * self.window_size, w[1:],
                                        range(self.window_size)):
                    # [1, 1, 1], [2, 3, 4], [0, 1, 2] ---> [(1,2,0), (1,3,1), (1,4,2)]
                    if t1 == '*PAD*' or t2 == '*PAD*':
                        continue
                    if t1 not in self.vocab_index_dict:
                        t1 = "UNKNOWN"
                    if t2 not in self.vocab_index_dict:
                        t2 = "UNKNOWN"
                    t1_id = self.vocab_index_dict[t1]
                    t2_id = self.vocab_index_dict[t2]

                    if self.window_weight == "linear":
                        count_matrix[t1_id, t2_id] += self.window_size - dist
                    elif self.window_weight == "flat":
                        count_matrix[t1_id, t2_id] += 1
                    else:
                        raise AttributeError('Invalid arg to "window_weight".')
                counter += 1

        # window_type
        if self.window_type == 'forward':
            self.vocab_embedding_matrix = count_matrix
        elif self.window_type == 'backward':
            self.vocab_embedding_matrix = count_matrix.transpose()
        elif self.window_type == 'summed':
            self.vocab_embedding_matrix = count_matrix + count_matrix.transpose()
        else:
            raise AttributeError('Invalid arg to "window_type".')
Example #11
def delete_words(aug_toks, *, num=1, pos=None):
    """
    Randomly delete words,
    up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
            through word deletion.
        num (int or float): If int, maximum number of words to delete;
            if float, probability that a given word will be deleted.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words are considered.

    Returns:
        List[:class:`AugTok`]: New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]

    cand_idxs = [
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
    ]
    rand_idxs = set(_select_random_candidates(cand_idxs, num))
    if not rand_idxs:
        return aug_toks[:]

    new_aug_toks = []
    padded_triplets = itertoolz.sliding_window(3, [None] + aug_toks + [None])
    for idx, (prev_tok, curr_tok, next_tok) in enumerate(padded_triplets):
        if idx in rand_idxs:
            # special case: word then [deleted word] then punctuation
            # give deleted word's whitespace to previous word
            if prev_tok and next_tok and prev_tok.is_word and not next_tok.is_word:
                new_aug_toks[-1] = aug_utils.AugTok(
                    text=prev_tok.text,
                    ws=curr_tok.ws,
                    pos=prev_tok.pos,
                    is_word=prev_tok.is_word,
                    syns=prev_tok.syns,
                )
        else:
            new_aug_toks.append(curr_tok)
    return new_aug_toks
Example #12
def build_units(dimensions: Iterable[int], activation: Optional[torch.nn.Module]) -> List[torch.nn.Module]:
    """
    Given a list of dimensions and optional activation, return a list of units where each unit is a linear
    layer followed by an activation layer.

    :param dimensions: iterable of dimensions for the chain
    :param activation: activation layer to use e.g. nn.ReLU, set to None to disable
    :return: list of instances of Sequential
    """
    def single_unit(in_dimension: int, out_dimension: int) -> torch.nn.Module:
        unit = [('linear', nn.Linear(in_dimension, out_dimension))]
        if activation is not None:
            unit.append(('activation', activation))
        return nn.Sequential(OrderedDict(unit))
    return [
        single_unit(embedding_dimension, hidden_dimension)
        for embedding_dimension, hidden_dimension
        in sliding_window(2, dimensions)
    ]
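
# A hedged usage sketch, assuming torch and the module's own imports (nn, OrderedDict,
# sliding_window) are available; the layer sizes are illustrative only. Note that the same
# activation instance is shared across units, which is fine for a stateless module like ReLU.
units = build_units([784, 256, 10], nn.ReLU())
encoder = nn.Sequential(*units)  # Linear(784->256)+ReLU, then Linear(256->10)+ReLU
print(encoder)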
Example #13
def _get_per_word_occurrence_values(doc, normalize, stop_words, window_size):
    """
    Get base values for each individual occurrence of a word, to be aggregated
    and combined into a per-word score.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        normalize (str)
        stop_words (Set[str])
        window_size (int)

    Returns:
        Dict[int, Dict[str, list]]
    """
    word_occ_vals = collections.defaultdict(
        lambda: collections.defaultdict(list))

    def _is_upper_cased(tok):
        return tok.is_upper or (tok.is_title and not tok.is_sent_start)

    attr_name = _get_attr_name(normalize, False)
    padding = [None] * window_size
    for sent_idx, sent in enumerate(doc.sents):
        sent_padded = itertoolz.concatv(padding, sent, padding)
        for window in itertoolz.sliding_window(1 + (2 * window_size),
                                               sent_padded):
            lwords, word, rwords = (window[:window_size], window[window_size],
                                    window[window_size + 1:])
            w_id = getattr(word, attr_name)
            if word.is_stop:
                stop_words.add(w_id)
            word_occ_vals[w_id]["is_uc"].append(_is_upper_cased(word))
            word_occ_vals[w_id]["sent_idx"].append(sent_idx)
            word_occ_vals[w_id]["l_context"].extend(
                getattr(w, attr_name) for w in lwords
                if not (w is None or w.is_punct or w.is_space))
            word_occ_vals[w_id]["r_context"].extend(
                getattr(w, attr_name) for w in rwords
                if not (w is None or w.is_punct or w.is_space))
    return word_occ_vals
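
# The heart of this helper is the padded sliding window that pairs each word with its
# left/right context. A standalone sketch of that pattern with plain strings instead of
# spaCy tokens (hedged; illustrative only):
from cytoolz import itertoolz

window_size = 2
words = ["keyword", "extraction", "from", "padded", "windows"]
padding = [None] * window_size
for window in itertoolz.sliding_window(1 + (2 * window_size),
                                       itertoolz.concatv(padding, words, padding)):
    lwords, word, rwords = (window[:window_size], window[window_size],
                            window[window_size + 1:])
    print(word, [w for w in lwords if w is not None], [w for w in rwords if w is not None])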
Example #14
def get_ngram_candidates(
    doc: Doc,
    ns: int | Collection[int],
    *,
    include_pos: Optional[str | Collection[str]] = ("NOUN", "PROPN", "ADJ"),
) -> Iterable[Tuple[Token, ...]]:
    """
    Get candidate keyterms from ``doc``, where candidates are n-length sequences
    of tokens (for all n in ``ns``) that don't start/end with a stop word or
    contain punctuation tokens, and whose constituent tokens are filtered by POS tag.

    Args:
        doc
        ns: One or more n values for which to generate n-grams. For example,
            ``2`` gets bigrams; ``(2, 3)`` gets bigrams and trigrams.
        include_pos: One or more POS tags with which to filter ngrams.
            If None, include tokens of all POS tags.

    Yields:
        Next ngram candidate, as a tuple of constituent Tokens.

    See Also:
        :func:`textacy.extract.ngrams()`
    """
    ns = utils.to_collection(ns, int, tuple)
    include_pos = utils.to_collection(include_pos, str, set)
    ngrams = itertoolz.concat(itertoolz.sliding_window(n, doc) for n in ns)
    ngrams = (
        ngram
        for ngram in ngrams
        if not (ngram[0].is_stop or ngram[-1].is_stop)
        and not any(word.is_punct or word.is_space for word in ngram)
    )
    if include_pos:
        ngrams = (
            ngram for ngram in ngrams if all(word.pos_ in include_pos for word in ngram)
        )
    for ngram in ngrams:
        yield ngram
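
# A hedged usage sketch, assuming spaCy with the en_core_web_sm model is installed and the
# module-level utils/itertoolz imports are in place; the sentence is illustrative only.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Natural language processing enables semantic network extraction.")
for ngram in get_ngram_candidates(doc, (2, 3)):
    print(" ".join(tok.text for tok in ngram))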
Example #15
def create_ww_matrix(vocab_list, vocab_index_dict, tokens, encoding):  # no function call overhead - twice as fast
    window_type = encoding['window_type']
    window_size = encoding['window_size']
    window_weight = encoding['window_weight']
    # count
    num_vocab = len(vocab_list)
    count_matrix = np.zeros([num_vocab, num_vocab])
    if VERBOSE:
        print('\nCounting word-word co-occurrences in {}-word moving window'.format(window_size))

    for i in range(window_size):
        tokens.append(PAD)

    windows = itertoolz.sliding_window(window_size + 1, tokens)  # + 1 because window consists of t2s only
    for window in windows:
        # print(window)
        if window[0] in vocab_index_dict:
            for i in range(window_size):
                if window[i+1] in vocab_index_dict:
                    dist = 1/(i+1)
                    if window_weight == "linear":
                        count_matrix[vocab_index_dict[window[0]], vocab_index_dict[window[i+1]]] += dist
                    elif window_weight == "flat":
                        count_matrix[vocab_index_dict[window[0]], vocab_index_dict[window[i+1]]] += 1
    # window_type
    if window_type == 'forward':
        final_matrix = count_matrix
    elif window_type == 'backward':
        final_matrix = count_matrix.transpose()
    elif window_type == 'summed':
        final_matrix = count_matrix + count_matrix.transpose()
    elif window_type == 'concatenated':
        final_matrix = np.concatenate((count_matrix, count_matrix.transpose()), axis=1)
    else:
        raise AttributeError('Invalid arg to "window_type".')
    #  print('Shape of normalized matrix={}'.format(final_matrix.shape))
    return final_matrix
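
# A hedged usage sketch; PAD and VERBOSE are module-level names used by the function and are
# assumed here to be defined as shown. The toy vocabulary and token stream are illustrative.
PAD = '*PAD*'
VERBOSE = False
vocab_list = ['the', 'cat', 'sat']
vocab_index_dict = {w: i for i, w in enumerate(vocab_list)}
tokens = ['the', 'cat', 'sat', 'the', 'cat']
encoding = {'window_type': 'forward', 'window_size': 2, 'window_weight': 'flat'}
print(create_ww_matrix(vocab_list, vocab_index_dict, tokens, encoding))  # 3x3 count matrix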
Example #16
def insert_word_synonyms(aug_toks, *, num=1, pos=None):
    """
    Randomly insert random synonyms of tokens for which synonyms are available,
    up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:obj:`AugTok`]): Sequence of tokens to augment
            through synonym insertion.
        num (int or float): If int, maximum number of words with available synonyms
            from which a random synonym is selected and randomly inserted; if float,
            probability that a given word with synonyms will provide a synonym
            to be inserted.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words with synonyms are considered.

    Returns:
        List[:obj:`AugTok`]: New, augmented sequence of tokens.

    Note:
        This transform requires :class:`textacy.resources.ConceptNet` to be downloaded
        to work properly, since this is the data source for word synonyms to be inserted.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]

    cand_aug_toks = [
        aug_tok for aug_tok in aug_toks
        if aug_tok.syns and (pos is None or aug_tok.pos in pos)
    ]
    rand_aug_toks = _select_random_candidates(cand_aug_toks, num)
    rand_idxs = random.sample(range(len(aug_toks)), len(rand_aug_toks))
    if not rand_idxs:
        return aug_toks[:]

    rand_aug_toks = iter(rand_aug_toks)
    new_aug_toks = []
    for idx, (prev_tok, curr_tok) in enumerate(
            itertoolz.sliding_window(2, [None] + aug_toks)):
        if idx in rand_idxs:
            rand_aug_tok = next(rand_aug_toks)
            if prev_tok:
                # use previous token's whitespace for inserted synonym
                new_tok_ws = prev_tok.ws
                if prev_tok.is_word and not prev_tok.ws:
                    # previous token should have whitespace, if a word
                    new_aug_toks[-1] = aug_utils.AugTok(
                        text=prev_tok.text,
                        ws=" ",
                        pos=prev_tok.pos,
                        is_word=True,
                        syns=prev_tok.syns,
                    )
            else:
                new_tok_ws = " "
            new_aug_toks.append(
                aug_utils.AugTok(
                    text=random.choice(rand_aug_tok.syns),
                    ws=new_tok_ws,
                    pos=rand_aug_tok.pos,
                    is_word=rand_aug_tok.is_word,
                    syns=rand_aug_tok.syns,  # TODO: re-fetch syns? use []?
                ))
        new_aug_toks.append(curr_tok)
    return new_aug_toks
Example #17
def build_cooccurrence_network(
        data: Sequence[str] | Sequence[Sequence[str]],
        *,
        window_size: int = 2,
        edge_weighting: str = "count",  # Literal["count", "binary"]
) -> nx.Graph:
    """
    Transform an ordered sequence of strings (or a sequence of such sequences)
    into a graph, where each string is represented by a node with weighted edges
    linking it to other strings that co-occur within ``window_size`` elements of itself.

    Input ``data`` can take a variety of forms. For example, as a ``Sequence[str]``
    where elements are token or term strings from a single document:

    .. code-block:: pycon

        >>> texts = [
        ...     "Mary had a little lamb. Its fleece was white as snow.",
        ...     "Everywhere that Mary went the lamb was sure to go.",
        ... ]
        >>> docs = [make_spacy_doc(text, lang="en_core_web_sm") for text in texts]
        >>> data = [tok.text for tok in docs[0]]
        >>> graph = build_cooccurrence_network(data, window_size=2)
        >>> sorted(graph.adjacency())[0]
        ('.', {'lamb': {'weight': 1}, 'Its': {'weight': 1}, 'snow': {'weight': 1}})

    Or as a ``Sequence[Sequence[str]]``, where elements are token or term strings
    per sentence from a single document:

    .. code-block:: pycon

        >>> data = [[tok.text for tok in sent] for sent in docs[0].sents]
        >>> graph = build_cooccurrence_network(data, window_size=2)
        >>> sorted(graph.adjacency())[0]
        ('.', {'lamb': {'weight': 1}, 'snow': {'weight': 1}})

    Or as a ``Sequence[Sequence[str]]``, where elements are token or term strings
    per document from multiple documents:

    .. code-block:: pycon

        >>> data = [[tok.text for tok in doc] for doc in docs]
        >>> graph = build_cooccurrence_network(data, window_size=2)
        >>> sorted(graph.adjacency())[0]
        ('.',
         {'lamb': {'weight': 1},
          'Its': {'weight': 1},
          'snow': {'weight': 1},
          'go': {'weight': 1}})

    Note how the "." token's connections to other nodes change for each case. (Note
    that in real usage, you'll probably want to remove stopwords, punctuation, etc.
    so that nodes in the graph represent meaningful concepts.)

    Args:
        data
        window_size: Size of sliding window over ``data`` that determines
            which strings are said to co-occur. For example, a value of 2 means that
            only immediately adjacent strings will have edges in the network;
            larger values loosen the definition of co-occurrence and typically
            lead to a more densely-connected network.

            .. note:: Co-occurrence windows are not permitted to cross sequences.
               So, if ``data`` is a ``Sequence[Sequence[str]]``, then co-occ counts
               are computed separately for each sub-sequence, then summed together.

        edge_weighting: Method by which edges between nodes are weighted.
            If "count", nodes are connected by edges with weights equal to
            the number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight set equal to 1.

    Returns:
        Graph whose nodes correspond to individual strings from ``data``;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.

    Reference:
        https://en.wikipedia.org/wiki/Co-occurrence_network
    """
    if not data:
        LOGGER.warning("input `data` is empty, so output graph is also empty")
        return nx.Graph()

    if window_size < 2:
        raise ValueError(
            f"window_size = {window_size} is invalid; value must be >= 2")

    # input data is Sequence[str]
    if isinstance(data[0], str):
        windows = itertoolz.sliding_window(min(window_size, len(data)), data)
    # input data is Sequence[Sequence[str]]
    elif isinstance(data[0], Sequence) and isinstance(data[0][0], str):
        windows = itertoolz.concat(
            itertoolz.sliding_window(min(window_size, len(subseq)), subseq)
            for subseq in data)
    else:
        raise TypeError(
            errors.type_invalid_msg(
                "data", data, Union[Sequence[str], Sequence[Sequence[str]]]))

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_counts = collections.Counter(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2))
        graph.add_edges_from((w1, w2, {
            "weight": weight
        }) for (w1, w2), weight in cooc_counts.items())
    elif edge_weighting == "binary":
        edge_data = {"weight": 1}
        graph.add_edges_from((w1, w2, edge_data) for window in windows
                             for (w1, w2) in itertools.combinations(window, 2))
    else:
        raise ValueError(
            errors.value_invalid_msg("edge_weighting", edge_weighting,
                                     {"count", "binary"}))

    return graph
Example #18
def co_occurence_matrix(target_index_dict, age_index_dict, childesdb_data):

    print('Creating co-occurrence matrices')
    window_type = 'forward'  # forward, backward, summed, concatenated
    window_size = 7
    window_weight = 'flat'  # linear or flat
    PAD = '*PAD*'

    # The goal is to create a 3 dimensional array of the following x,y,z dimensions: MCDI words X MCDI words X Age
    num_targets = len(target_index_dict)
    num_ages = len(age_index_dict)
    cooc_matrix_by_age_list = []
    cumulative_cooc_matrix_by_age_list = []

    corpus_by_age_list = []
    for i in range(num_ages):
        corpus_by_age_list.append([])

    # Then specify what items (words) will be updating the correct row and columns.
    for i in range(len(childesdb_data)):
        utterance = childesdb_data[i][3]
        age = childesdb_data[i][2]
        age_index = age_index_dict[age]
        corpus_by_age_list[age_index] += utterance

    # now we are ready to start counting co-occurrences for each age

    for i in range(num_ages):
        cooc_matrix = np.zeros([num_targets, num_targets], float)
        cumulative_cooc_matrix = np.zeros([num_targets, num_targets], float)
        current_corpus = corpus_by_age_list[i]
        if len(current_corpus) > 0:
            current_corpus += [
                PAD
            ] * window_size  # add pad such that all co-occurrences in last window are captured
            windows = itertoolz.sliding_window(window_size, current_corpus)

            for w in windows:
                for word1, word2, dist in zip([w[0]] * (window_size - 1),
                                              w[1:], range(1, window_size)):
                    # increment
                    if word1 == PAD or word2 == PAD:
                        continue

                    if word1 not in target_index_dict:
                        continue

                    if word2 not in target_index_dict:
                        continue
                    word1_index = target_index_dict[word1]
                    word2_index = target_index_dict[word2]

                    if window_weight == "linear":
                        cooc_matrix[word1_index,
                                    word2_index] += window_size - dist
                    elif window_weight == "flat":
                        cooc_matrix[word1_index, word2_index] += 1
            # window_type
            if window_type == 'forward':
                final_matrix = cooc_matrix
            elif window_type == 'backward':
                final_matrix = cooc_matrix.transpose()
            elif window_type == 'summed':
                final_matrix = cooc_matrix + cooc_matrix.transpose()
            elif window_type == 'concatenated':
                final_matrix = np.concatenate(
                    (cooc_matrix, cooc_matrix.transpose()))
            else:
                raise AttributeError('Invalid arg to "window_type".')

            cooc_matrix_by_age_list.append(final_matrix)
        else:
            # keep an all-zero matrix for ages with no utterances so list indices stay aligned
            cooc_matrix_by_age_list.append(cooc_matrix)

    current_cumul_cooc_matrix = np.zeros([num_targets, num_targets], float)

    for i in range(num_ages):
        current_cooc_matrix = cooc_matrix_by_age_list[i]
        current_cumul_cooc_matrix += current_cooc_matrix
        cumulative_cooc_matrix_by_age_list.append(
            current_cumul_cooc_matrix.copy())

    return cooc_matrix_by_age_list, cumulative_cooc_matrix_by_age_list
Example #19
def terms_to_semantic_network(
    terms: Union[Sequence[str], Sequence[Token]],
    *,
    normalize: Union[str, bool, Callable[[Token], str]] = "lemma",
    window_width: int = 10,
    edge_weighting: str = "cooc_freq",
) -> nx.Graph:
    """
    Transform an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_width`` terms of itself.

    Args:
        terms
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if falsy, use the form of terms as they appear in ``terms``;
            if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``Token``.

        window_width: Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({'cooc_freq', 'binary'}): If 'cooc_freq', the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if 'binary', all such edges have weight = 1.

    Returns:
        Networkx graph whose nodes represent individual terms, connected by edges
        based on term co-occurrence with weights determined by ``edge_weighting``.

    Note:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or ``Token`` s beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`textacy.spacier.utils.get_normalized_text()`
    """
    if window_width < 2:
        raise ValueError(
            "`window_width` = {} is invalid; value must be >= 2".format(
                window_width))
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            "`terms` has fewer items (%s) than the specified `window_width` (%s); "
            "setting window width to %s",
            len(terms),
            window_width,
            len(terms),
        )
        window_width = len(terms)

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], Token):
        if normalize == "lemma":
            windows = (
                (tok.lemma_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == "lower":
            windows = (
                (tok.lower_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = (
                (tok.text for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = (
                (normalize(tok) for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(terms[0])))

    graph = nx.Graph()

    if edge_weighting == "cooc_freq":
        cooc_mat: DefaultDict[str, DefaultDict[str, int]]
        cooc_mat = collections.defaultdict(
            lambda: collections.defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from((w1, w2, {
            "weight": weight
        }) for w1, w2s in cooc_mat.items() for w2, weight in w2s.items())
    elif edge_weighting == "binary":
        graph.add_edges_from(w1_w2 for window in windows
                             for w1_w2 in itertools.combinations(window, 2))

    return graph
Example #20
def test_sliding_window_of_short_iterator():
    assert list(sliding_window(3, [1, 2])) == []
Example #21
def scake(
    doc: Doc,
    *,
    normalize: Optional[Union[str, Callable[[Token], str]]] = "lemma",
    include_pos: Optional[Union[str,
                                Collection[str]]] = ("NOUN", "PROPN", "ADJ"),
    topn: Union[int, float] = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the sCAKE algorithm.

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms. Must be sentence-segmented;
            optionally POS-tagged.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms; if None,
            use the form of terms as they appeared in ``doc``; if a callable,
            must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        include_pos: One or more POS tags with which to filter for good candidate keyterms.
            If None, include tokens of all POS tags
            (which also allows keyterm extraction from docs without POS-tagging.)
        topn: Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding scores.

    References:
        Duari, Swagata & Bhatnagar, Vasudha. (2018). sCAKE: Semantic Connectivity
        Aware Keyword Extraction. Information Sciences. 477.
        https://arxiv.org/abs/1811.10831v1
    """
    # validate / transform args
    include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn))

    # bail out on empty docs
    if not doc:
        return []

    # build up a graph of good words, edges weighting by adjacent sentence co-occurrence
    cooc_mat: Counter[Tuple[str, str]] = collections.Counter()
    # handle edge case where doc only has 1 sentence
    n_sents = itertoolz.count(doc.sents)
    for window_sents in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        if n_sents == 1:
            window_sents = (window_sents[0], [])
        window_words: Iterable[str] = (
            word for word in itertoolz.concat(window_sents)
            if not (word.is_stop or word.is_punct or word.is_space) and (
                not include_pos or word.pos_ in include_pos))
        window_words = ke_utils.normalize_terms(window_words, normalize)
        cooc_mat.update(
            w1_w2 for w1_w2 in itertools.combinations(sorted(window_words), 2)
            if w1_w2[0] != w1_w2[1])
    # doc doesn't have any valid words...
    if not cooc_mat:
        return []

    graph = nx.Graph()
    graph.add_edges_from((w1, w2, {
        "weight": weight
    }) for (w1, w2), weight in cooc_mat.items())

    word_scores = _compute_word_scores(doc, graph, cooc_mat, normalize)
    if not word_scores:
        return []

    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
    if isinstance(topn, float):
        topn = int(round(len(set(candidates)) * topn))
    # rank candidates by aggregating constituent word scores
    candidate_scores = {
        " ".join(candidate):
        sum(word_scores.get(word, 0.0) for word in candidate)
        for candidate in candidates
    }
    sorted_candidate_scores = sorted(candidate_scores.items(),
                                     key=operator.itemgetter(1, 0),
                                     reverse=True)
    return ke_utils.get_filtered_topn_terms(sorted_candidate_scores,
                                            topn,
                                            match_threshold=0.8)
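
# A hedged usage sketch, assuming textacy (which provides the _compute_word_scores and
# _get_candidates helpers used above) and the en_core_web_sm spaCy model are installed.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Semantic connectivity aware keyword extraction builds a word co-occurrence "
          "graph over adjacent sentences and ranks candidate terms from it.")
print(scake(doc, topn=5))  # [(term, score), ...] sorted by score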
Example #22
def test_sliding_window_of_short_iterator():
    assert list(sliding_window(3, [1, 2])) == []
Example #23
def test_sliding_window():
    assert list(sliding_window(2, [1, 2, 3, 4])) == [(1, 2), (2, 3), (3, 4)]
    assert list(sliding_window(3, [1, 2, 3, 4])) == [(1, 2, 3), (2, 3, 4)]
Example #24
def get_sliding_windows(
    window_size: int,
    tokens: List[str],
) -> List[List[str]]:
    res = list(itertoolz.sliding_window(window_size, tokens))
    return res
Example #25
def test_sliding_window():
    assert list(sliding_window(2, [1, 2, 3, 4])) == [(1, 2), (2, 3), (3, 4)]
    assert list(sliding_window(3, [1, 2, 3, 4])) == [(1, 2, 3), (2, 3, 4)]
Example #26
def get_sliding_windows(window_size, tokens):
    res = list(itertoolz.sliding_window(window_size, tokens))
    return res
Example #27
def build_graph_from_terms(terms,
                           *,
                           normalize="lemma",
                           window_size=10,
                           edge_weighting="count"):
    """
    Transform an ordered list of non-overlapping terms into a graph,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_size`` terms of itself.

    Args:
        terms (List[str] or List[:class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`])
        normalize (str or Callable): If "lemma", lemmatize terms; if "lower",
            lowercase terms; if falsy, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``Token`` and return
            a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``Token`` or ``Span``.

        window_size (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({"count", "binary"}): If "count", the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight = 1.

    Returns:
        :class:`networkx.Graph`: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.
    """
    if window_size < 2:
        raise ValueError(
            "window_size = {} is invalid; value must be >= 2".format(
                window_size))
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()

    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms),
            window_size,
            len(terms),
        )
        window_size = len(terms)

    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, str):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, ke_utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)))

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2))
        graph.add_edges_from((w1, w2, {
            "weight": weight
        }) for (w1, w2), weight in cooc_mat.items())
    elif edge_weighting == "binary":
        graph.add_edges_from(w1_w2 for window in windows
                             for w1_w2 in itertools.combinations(window, 2))
    else:
        raise ValueError(
            "edge_weighting = {} is invalid; must be one of {}".format(
                edge_weighting, {"count", "binary"}))

    return graph
Example #28
def terms_to_semantic_network(terms,
                              normalize='lemma',
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (List[str] or List[``spacy.Token``])
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in doc; if a callable, must accept a ``spacy.Token`` and return a
            str, e.g. :func:`textacy.spacy_utils.normalized_str()`;
            only applicable if ``terms`` is a List[``spacy.Token``]
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        ``networkx.Graph``: Nodes are terms, edges are for co-occurrences of terms.

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`normalized_str() <textacy.spacy_utils.normalized_str>`
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2')
    if not terms:
        raise ValueError(
            '`terms` list is empty; it must contain 1 or more terms')

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.warning(
            'input terms list is smaller than window width (%s < %s)',
            len(terms), window_width)
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], SpacyToken):
        if normalize == 'lemma':
            windows = (
                (tok.lemma_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == 'lower':
            windows = (
                (tok.lower_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = (
                (tok.text for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = (
                (normalize(tok) for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(
            type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = collections.defaultdict(
            lambda: collections.defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from((w1, w2, {
            'weight': cooc_mat[w1][w2]
        }) for w1, w2s in cooc_mat.items() for w2 in w2s)

    elif edge_weighting == 'binary':
        graph.add_edges_from(w1_w2 for window in windows
                             for w1_w2 in itertools.combinations(window, 2))

    return graph
Example #29
def make_sparse_ww_matrix(docs: Generator[List[str], None, None],
                          w2id: Dict[str, int],
                          window_size: int,
                          window_type: str,
                          window_weight: str,
                          max_num_docs: Optional[int] = None,
                          pad='*PAD*',
                          ) -> sparse.coo_matrix:

    print('Counting word-word co-occurrences in {}-word moving window'.format(window_size))

    # init lists for sparse matrix construction
    rows = []
    cols = []
    data = []

    if max_num_docs is not None:
        docs = islice(docs, max_num_docs)

    print(len(w2id))

    for tokens in docs:
        # pad tokens such that all co-occurrences in last window are captured
        padding = (pad for _ in range(window_size))
        tokens_padded = chain(tokens, padding)

        # + 1 because window consists of w2s only
        for window in itertoolz.sliding_window(window_size + 1, tokens_padded):

            for w1, w2, dist in zip([window[0]] * window_size, window[1:],
                                    range(window_size)):
                if w1 in w2id and w2 in w2id:
                    # padding tokens never appear in w2id, so windows that run into
                    # the padding are skipped here and rows/cols/data stay aligned
                    w1_id = w2id[w1]
                    w2_id = w2id[w2]
                    rows.append(w1_id)
                    cols.append(w2_id)
                    # increment
                    if window_weight == "linear":
                        data.append(window_size - dist)
                    elif window_weight == "flat":
                        data.append(1)

    matrix = sparse.coo_matrix((np.array(data, dtype=np.int32), (rows, cols)))

    # window_type
    if window_type == 'forward':
        matrix = matrix
    elif window_type == 'backward':
        matrix = matrix.transpose()
    elif window_type == 'summed':
        matrix = matrix + matrix.transpose()
    elif window_type == 'concatenated':
        # np.concatenate does not work on sparse matrices; sparse.hstack does
        matrix = sparse.hstack([matrix, matrix.transpose()])
    else:
        raise AttributeError('Invalid arg to "window_type".')
    print('Shape of matrix={}'.format(matrix.shape))

    return matrix
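
# A hedged usage sketch; it assumes the module's own imports (numpy as np, scipy.sparse as
# sparse, cytoolz's itertoolz, and itertools' chain/islice) are in place. Toy corpus only.
docs = (toks for toks in [['the', 'cat', 'sat'], ['the', 'dog', 'sat']])
w2id = {'the': 0, 'cat': 1, 'sat': 2, 'dog': 3}
ww = make_sparse_ww_matrix(docs, w2id, window_size=2,
                           window_type='forward', window_weight='flat')
print(ww.toarray())  # dense view of the forward co-occurrence counts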
Example #30
def terms_to_semantic_network(terms,
                              normalize='lemma',
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Transform an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_width`` terms of itself.

    Args:
        terms (List[str] or List[``spacy.Token``])
        normalize (str or Callable): If 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``spacy.Token`` and return
            a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``spacy.Token``.

        window_width (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({'cooc_freq', 'binary'}): If 'cooc_freq', the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if 'binary', all such edges have weight = 1.

    Returns:
        ``networkx.Graph``: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`textacy.spacier.utils.get_normalized_text()`
    """
    if window_width < 2:
        raise ValueError(
            '`window_width` = {} is invalid; value must be >= 2'.format(window_width))
    if not terms:
        raise ValueError(
            '`terms` = {} is invalid; it must contain at least 1 term '
            'in the form of a string or spacy token'.format(terms))

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            '`terms` has fewer items (%s) than the specified `window_width` (%s); '
            'setting window width to %s',
            len(terms), window_width, len(terms))
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], SpacyToken):
        if normalize == 'lemma':
            windows = ((tok.lemma_ for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == 'lower':
            windows = ((tok.lower_ for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = ((tok.text for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = ((normalize(tok) for tok in window)
                       for window in itertoolz.sliding_window(window_width, terms))
    else:
        raise TypeError(
            'items in `terms` must be strings or spacy tokens, not {}'.format(type(terms[0])))

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = collections.defaultdict(lambda: collections.defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': weight})
            for w1, w2s in cooc_mat.items()
            for w2, weight in w2s.items())
    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph