def terms_to_semantic_network(terms, window_width=10, edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (list(str) or list(``spacy.Token``))
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        :class:`networkx.Graph()`

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to normalize so that like terms are
          counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2.')

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], spacy_token):
        windows = ((normalized_str(tok) for tok in window)
                   for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(
            type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()
    if edge_weighting == 'cooc_freq':
        cooc_mat = defaultdict(lambda: defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': cooc_mat[w1][w2]})
            for w1, w2s in cooc_mat.items() for w2 in w2s)
    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph
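# A minimal usage sketch for the function above (not part of the original source;
# assumes the imports it relies on are in scope: networkx as nx, cytoolz's itertoolz,
# itertools, and collections' defaultdict). The term list is a made-up example;
# as the docstring notes, real input should be filtered and normalized first.
#
# terms = ["network", "graph", "node", "edge", "graph", "node"]
# graph = terms_to_semantic_network(terms, window_width=3, edge_weighting="cooc_freq")
# graph["graph"]["node"]  # -> {'weight': ...}, co-occurrence count within windows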
def context_and_token(
    self, tokens: Iterable[Token]
) -> Iterable[Tuple[Sequence[Token], Token]]:
    padding = [self.codec.PAD] * self.context_size
    for window in sliding_window(self.context_size + 1, concat([padding, tokens, padding])):
        yield (window[:-1], window[-1])
def main():
    input_ = load_from_file("day9_input.txt")
    numbers = [int(line) for line in input_]
    for i in range(25, len(numbers)):
        if numbers[i] not in nondiagonal_sums(numbers[i - 25:i]):
            sol_pt1 = numbers[i]
            index_ = i
            break
    print(sol_pt1)
    assert sol_pt1 == 1492208709  # My solution

    preceding_nums = numbers[:index_]
    sliding_window_sums = {}
    for i in range(2, 100):
        sliding_window_sums[i] = list(
            map(sum, sliding_window(i, preceding_nums)))
        if sol_pt1 in sliding_window_sums[i]:
            index_of_sum = sliding_window_sums[i].index(sol_pt1)
            contiguous_range = numbers[index_of_sum:index_of_sum + i]
            sol_pt2 = min(contiguous_range) + max(contiguous_range)
            break
    print(sol_pt2)
    assert sol_pt2 == 238243506  # My solution
def __init__(self, dimensions: List[int]):
    super(Classifier, self).__init__()
    units = []
    for from_dimension, to_dimension in sliding_window(2, dimensions):
        units.append(nn.Linear(from_dimension, to_dimension))
        units.append(nn.ReLU())
    self.classifier = nn.Sequential(*units[:-1])
    self.softmax = nn.LogSoftmax(dim=1)
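# Illustrative sketch (not part of the class above): sliding_window(2, dims) pairs
# consecutive dimensions, so a hypothetical [784, 256, 10] yields the layer shapes
# (784, 256) and (256, 10); units[:-1] then drops the trailing ReLU after the last
# Linear layer. Assumes cytoolz's itertoolz module; the dims are made-up examples.
from cytoolz import itertoolz

dims = [784, 256, 10]
assert list(itertoolz.sliding_window(2, dims)) == [(784, 256), (256, 10)]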
def swap_words(aug_toks, *, num=1, pos=None):
    """
    Randomly swap the positions of two *adjacent* words,
    up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
            through position swapping.
        num (int or float): If int, maximum number of adjacent word pairs to swap;
            if float, probability that a given word pair will be swapped.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words are considered.

    Returns:
        List[:class:`AugTok`]: New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # if we don't require _adjacent_ words, this does the trick
    # if not pos:
    #     pos = set(aug_tok.pos for aug_tok in aug_toks if aug_tok.is_word)
    # cand_idx_pairs = list(
    #     itertools.chain.from_iterable(
    #         itertools.combinations(
    #             (idx for idx, aug_tok in enumerate(aug_toks) if aug_tok.pos == pos_),
    #             2,
    #         )
    #         for pos_ in pos
    #     )
    # )
    cand_idxs = (
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
    )
    cand_idx_pairs = [
        (idx1, idx2)
        for idx1, idx2 in itertoolz.sliding_window(2, cand_idxs)
        if idx2 - idx1 == 1
    ]
    rand_idx_pairs = _select_random_candidates(cand_idx_pairs, num)
    if not rand_idx_pairs:
        return aug_toks[:]
    new_aug_toks = aug_toks[:]
    for idx1, idx2 in rand_idx_pairs:
        tok1 = new_aug_toks[idx1]
        tok2 = new_aug_toks[idx2]
        new_aug_toks[idx1] = aug_utils.AugTok(
            text=tok2.text,
            ws=tok1.ws,
            pos=tok2.pos,
            is_word=tok2.is_word,
            syns=tok2.syns,
        )
        new_aug_toks[idx2] = aug_utils.AugTok(
            text=tok1.text,
            ws=tok2.ws,
            pos=tok1.pos,
            is_word=tok1.is_word,
            syns=tok1.syns,
        )
    return new_aug_toks
def _randomly_segment_text(text: str, len_range: Tuple[int, int]) -> Iterable[str]:
    min_len, max_len = len_range
    idxs = []
    idx = 0
    while idx < len(text):
        idxs.append(idx)
        idx += random.randint(min_len, max_len)
    idxs.append(len(text))
    for idx_start, idx_end in itertoolz.sliding_window(2, idxs):
        yield text[idx_start:idx_end]
def delete_words(
    aug_toks: List[aug_utils.AugTok],
    *,
    num: Union[int, float] = 1,
    pos: Optional[Union[str, Set[str]]] = None,
) -> List[aug_utils.AugTok]:
    """
    Randomly delete words, up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks: Sequence of tokens to augment through word deletion.
        num: If int, maximum number of words to delete;
            if float, probability that a given word will be deleted.
        pos: Part of speech tag(s) of words to be considered for augmentation.
            If None, all words are considered.

    Returns:
        New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = cast(Set[str], utils.to_collection(pos, str, set))
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]
    cand_idxs = [
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos) and idx > 0
    ]
    rand_idxs = set(_select_random_candidates(cand_idxs, num))
    if not rand_idxs:
        return aug_toks[:]
    new_aug_toks: List[aug_utils.AugTok] = []
    # NOTE: https://github.com/python/mypy/issues/5492
    padded_triplets = itertoolz.sliding_window(
        3, [None] + aug_toks + [None],  # type: ignore
    )
    for idx, (prev_tok, curr_tok, next_tok) in enumerate(padded_triplets):
        if idx in rand_idxs:
            # special case: word then [deleted word] then punctuation
            # give deleted word's whitespace to previous word
            if prev_tok and next_tok and prev_tok.is_word and not next_tok.is_word:
                new_aug_toks[-1] = aug_utils.AugTok(
                    text=prev_tok.text,
                    ws=curr_tok.ws,
                    pos=prev_tok.pos,
                    is_word=prev_tok.is_word,
                    syns=prev_tok.syns,
                )
        else:
            new_aug_toks.append(curr_tok)
    return new_aug_toks
def iter_skip_window_walk(
    walk: List[Hashable], window_size: int
) -> Iterable[Tuple[int, int]]:
    """
    Given a walk of nodes and a window size, which is interpreted as the number of
    nodes to the left and to the right of the central node, iteratively yield the
    central node and a choice of target node from its windows to the left and right
    in the walk.

    :param walk: list of nodes
    :param window_size: number of nodes to the left and to the right
    :return: yields 2-tuples of source and target for training
    """
    for window in sliding_window(2 * window_size + 1, walk):
        for target in window[:window_size] + window[window_size + 1:]:
            yield (window[window_size], target)
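# Illustrative note (hypothetical walk values, not from the original source):
# with window_size=1, each interior node is paired with its left and right
# neighbours, which is the classic skip-gram (source, target) generation.
#
# list(iter_skip_window_walk([1, 2, 3, 4], window_size=1))
# -> [(2, 1), (2, 3), (3, 2), (3, 4)]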
def create_word_word_embedding(self):
    print("   Counting word-word co-occurrences in {}-word moving window".format(self.window_size))
    count_matrix = np.zeros([self.num_vocab, self.num_vocab])
    for i in range(self.corpus.num_documents):
        current_token_list = self.corpus.document_token_list[i] + ['*PAD*'] * self.window_size
        windows = itertoolz.sliding_window(self.window_size + 1, current_token_list)
        # [(1,2,3,4), (2,3,4,5), ...]
        # flat  1 1 1
        # lin   3 2 1
        # nlin  4 2 1
        counter = 0
        for w in windows:
            for t1, t2, dist in zip([w[0]] * self.window_size, w[1:], range(self.window_size)):
                # [1, 1, 1], [2, 3, 4], [0, 1, 2] ---> [(1,2,0), (1,3,1), (1,4,2)]
                if t1 == '*PAD*' or t2 == '*PAD*':
                    continue
                if t1 not in self.vocab_index_dict:
                    t1 = "UNKNOWN"
                if t2 not in self.vocab_index_dict:
                    t2 = "UNKNOWN"
                t1_id = self.vocab_index_dict[t1]
                t2_id = self.vocab_index_dict[t2]
                if self.window_weight == "linear":
                    count_matrix[t1_id, t2_id] += self.window_size - dist
                elif self.window_weight == "flat":
                    count_matrix[t1_id, t2_id] += 1
                else:
                    raise AttributeError('Invalid arg to "window_weight".')
                counter += 1

    # window_type
    if self.window_type == 'forward':
        self.vocab_embedding_matrix = count_matrix
    elif self.window_type == 'backward':
        self.vocab_embedding_matrix = count_matrix.transpose()
    elif self.window_type == 'summed':
        self.vocab_embedding_matrix = count_matrix + count_matrix.transpose()
    else:
        raise AttributeError('Invalid arg to "window_type".')
def delete_words(aug_toks, *, num=1, pos=None):
    """
    Randomly delete words, up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
            through word deletion.
        num (int or float): If int, maximum number of words to delete;
            if float, probability that a given word will be deleted.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words are considered.

    Returns:
        List[:class:`AugTok`]: New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]
    cand_idxs = [
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
    ]
    rand_idxs = set(_select_random_candidates(cand_idxs, num))
    if not rand_idxs:
        return aug_toks[:]
    new_aug_toks = []
    padded_triplets = itertoolz.sliding_window(3, [None] + aug_toks + [None])
    for idx, (prev_tok, curr_tok, next_tok) in enumerate(padded_triplets):
        if idx in rand_idxs:
            # special case: word then [deleted word] then punctuation
            # give deleted word's whitespace to previous word
            if prev_tok and next_tok and prev_tok.is_word and not next_tok.is_word:
                new_aug_toks[-1] = aug_utils.AugTok(
                    text=prev_tok.text,
                    ws=curr_tok.ws,
                    pos=prev_tok.pos,
                    is_word=prev_tok.is_word,
                    syns=prev_tok.syns,
                )
        else:
            new_aug_toks.append(curr_tok)
    return new_aug_toks
def build_units(
    dimensions: Iterable[int], activation: Optional[torch.nn.Module]
) -> List[torch.nn.Module]:
    """
    Given a list of dimensions and optional activation, return a list of units where
    each unit is a linear layer followed by an activation layer.

    :param dimensions: iterable of dimensions for the chain
    :param activation: activation layer to use e.g. nn.ReLU, set to None to disable
    :return: list of instances of Sequential
    """
    def single_unit(in_dimension: int, out_dimension: int) -> torch.nn.Module:
        unit = [('linear', nn.Linear(in_dimension, out_dimension))]
        if activation is not None:
            unit.append(('activation', activation))
        return nn.Sequential(OrderedDict(unit))

    return [
        single_unit(embedding_dimension, hidden_dimension)
        for embedding_dimension, hidden_dimension in sliding_window(2, dimensions)
    ]
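# Hypothetical usage sketch for build_units (assumes torch is installed and the
# function above is in scope; the dimensions are made-up). [500, 2000, 10] yields
# two units, Linear(500, 2000)+ReLU and Linear(2000, 10)+ReLU, each wrapped in
# its own nn.Sequential and easily chained into a larger module.
#
# units = build_units([500, 2000, 10], nn.ReLU())
# encoder = nn.Sequential(*units)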
def _get_per_word_occurrence_values(doc, normalize, stop_words, window_size):
    """
    Get base values for each individual occurrence of a word, to be aggregated
    and combined into a per-word score.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        normalize (str)
        stop_words (Set[str])
        window_size (int)

    Returns:
        Dict[int, Dict[str, list]]
    """
    word_occ_vals = collections.defaultdict(lambda: collections.defaultdict(list))

    def _is_upper_cased(tok):
        return tok.is_upper or (tok.is_title and not tok.is_sent_start)

    attr_name = _get_attr_name(normalize, False)
    padding = [None] * window_size
    for sent_idx, sent in enumerate(doc.sents):
        sent_padded = itertoolz.concatv(padding, sent, padding)
        for window in itertoolz.sliding_window(1 + (2 * window_size), sent_padded):
            lwords, word, rwords = (
                window[:window_size],
                window[window_size],
                window[window_size + 1:],
            )
            w_id = getattr(word, attr_name)
            if word.is_stop:
                stop_words.add(w_id)
            word_occ_vals[w_id]["is_uc"].append(_is_upper_cased(word))
            word_occ_vals[w_id]["sent_idx"].append(sent_idx)
            word_occ_vals[w_id]["l_context"].extend(
                getattr(w, attr_name) for w in lwords
                if not (w is None or w.is_punct or w.is_space)
            )
            word_occ_vals[w_id]["r_context"].extend(
                getattr(w, attr_name) for w in rwords
                if not (w is None or w.is_punct or w.is_space)
            )
    return word_occ_vals
def get_ngram_candidates(
    doc: Doc,
    ns: int | Collection[int],
    *,
    include_pos: Optional[str | Collection[str]] = ("NOUN", "PROPN", "ADJ"),
) -> Iterable[Tuple[Token, ...]]:
    """
    Get candidate keyterms from ``doc``, where candidates are n-length sequences
    of tokens (for all n in ``ns``) that don't start/end with a stop word or
    contain punctuation tokens, and whose constituent tokens are filtered by POS tag.

    Args:
        doc
        ns: One or more n values for which to generate n-grams. For example,
            ``2`` gets bigrams; ``(2, 3)`` gets bigrams and trigrams.
        include_pos: One or more POS tags with which to filter ngrams.
            If None, include tokens of all POS tags.

    Yields:
        Next ngram candidate, as a tuple of constituent Tokens.

    See Also:
        :func:`textacy.extract.ngrams()`
    """
    ns = utils.to_collection(ns, int, tuple)
    include_pos = utils.to_collection(include_pos, str, set)
    ngrams = itertoolz.concat(itertoolz.sliding_window(n, doc) for n in ns)
    ngrams = (
        ngram
        for ngram in ngrams
        if not (ngram[0].is_stop or ngram[-1].is_stop)
        and not any(word.is_punct or word.is_space for word in ngram)
    )
    if include_pos:
        ngrams = (
            ngram
            for ngram in ngrams
            if all(word.pos_ in include_pos for word in ngram)
        )
    for ngram in ngrams:
        yield ngram
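# Hypothetical usage sketch (assumes a spaCy pipeline with POS tagging is loaded
# as `nlp`; the text is made-up and the candidates returned depend entirely on the
# model's tags and stop-word list):
#
# doc = nlp("Sliding windows over tokens give ngram candidates.")
# for ngram in get_ngram_candidates(doc, (2, 3)):
#     print([tok.text for tok in ngram])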
def create_ww_matrix(vocab_list, vocab_index_dict, tokens, encoding):  # no function call overhead - twice as fast
    window_type = encoding['window_type']
    window_size = encoding['window_size']
    window_weight = encoding['window_weight']

    # count
    num_vocab = len(vocab_list)
    count_matrix = np.zeros([num_vocab, num_vocab])
    if VERBOSE:
        print('\nCounting word-word co-occurrences in {}-word moving window'.format(window_size))
    for i in range(window_size):
        tokens.append(PAD)
    windows = itertoolz.sliding_window(window_size + 1, tokens)  # + 1 because window consists of t2s only
    for window in windows:
        # print(window)
        if window[0] in vocab_index_dict:
            for i in range(window_size):
                if window[i + 1] in vocab_index_dict:
                    dist = 1 / (i + 1)
                    if window_weight == "linear":
                        count_matrix[vocab_index_dict[window[0]], vocab_index_dict[window[i + 1]]] += dist
                    elif window_weight == "flat":
                        count_matrix[vocab_index_dict[window[0]], vocab_index_dict[window[i + 1]]] += 1

    # window_type
    if window_type == 'forward':
        final_matrix = count_matrix
    elif window_type == 'backward':
        final_matrix = count_matrix.transpose()
    elif window_type == 'summed':
        final_matrix = count_matrix + count_matrix.transpose()
    elif window_type == 'concatenated':
        final_matrix = np.concatenate((count_matrix, count_matrix.transpose()), axis=1)
    else:
        raise AttributeError('Invalid arg to "window_type".')
    # print('Shape of normalized matrix={}'.format(final_matrix.shape))
    return final_matrix
def insert_word_synonyms(aug_toks, *, num=1, pos=None):
    """
    Randomly insert random synonyms of tokens for which synonyms are available,
    up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:obj:`AugTok`]): Sequence of tokens to augment
            through synonym insertion.
        num (int or float): If int, maximum number of words with available synonyms
            from which a random synonym is selected and randomly inserted;
            if float, probability that a given word with synonyms will provide
            a synonym to be inserted.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words with synonyms are considered.

    Returns:
        List[:obj:`AugTok`]: New, augmented sequence of tokens.

    Note:
        This transform requires :class:`textacy.resources.ConceptNet` to be downloaded
        to work properly, since this is the data source for word synonyms to be inserted.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]
    cand_aug_toks = [
        aug_tok for aug_tok in aug_toks
        if aug_tok.syns and (pos is None or aug_tok.pos in pos)
    ]
    rand_aug_toks = _select_random_candidates(cand_aug_toks, num)
    rand_idxs = random.sample(range(len(aug_toks)), len(rand_aug_toks))
    if not rand_idxs:
        return aug_toks[:]
    rand_aug_toks = iter(rand_aug_toks)
    new_aug_toks = []
    for idx, (prev_tok, curr_tok) in enumerate(
            itertoolz.sliding_window(2, [None] + aug_toks)):
        if idx in rand_idxs:
            rand_aug_tok = next(rand_aug_toks)
            if prev_tok:
                # use previous token's whitespace for inserted synonym
                new_tok_ws = prev_tok.ws
                if prev_tok.is_word and not prev_tok.ws:
                    # previous token should have whitespace, if a word
                    new_aug_toks[-1] = aug_utils.AugTok(
                        text=prev_tok.text,
                        ws=" ",
                        pos=prev_tok.pos,
                        is_word=True,
                        syns=prev_tok.syns,
                    )
            else:
                new_tok_ws = " "
            new_aug_toks.append(
                aug_utils.AugTok(
                    text=random.choice(rand_aug_tok.syns),
                    ws=new_tok_ws,
                    pos=rand_aug_tok.pos,
                    is_word=rand_aug_tok.is_word,
                    syns=rand_aug_tok.syns,  # TODO: re-fetch syns? use []?
                )
            )
        new_aug_toks.append(curr_tok)
    return new_aug_toks
def build_cooccurrence_network(
    data: Sequence[str] | Sequence[Sequence[str]],
    *,
    window_size: int = 2,
    edge_weighting: str = "count",  # Literal["count", "binary"]
) -> nx.Graph:
    """
    Transform an ordered sequence of strings (or a sequence of such sequences)
    into a graph, where each string is represented by a node with weighted edges
    linking it to other strings that co-occur within ``window_size`` elements of itself.

    Input ``data`` can take a variety of forms. For example, as a ``Sequence[str]``
    where elements are token or term strings from a single document:

    .. code-block:: pycon

        >>> texts = [
        ...     "Mary had a little lamb. Its fleece was white as snow.",
        ...     "Everywhere that Mary went the lamb was sure to go.",
        ... ]
        >>> docs = [make_spacy_doc(text, lang="en_core_web_sm") for text in texts]
        >>> data = [tok.text for tok in docs[0]]
        >>> graph = build_cooccurrence_network(data, window_size=2)
        >>> sorted(graph.adjacency())[0]
        ('.', {'lamb': {'weight': 1}, 'Its': {'weight': 1}, 'snow': {'weight': 1}})

    Or as a ``Sequence[Sequence[str]]``, where elements are token or term strings
    per sentence from a single document:

    .. code-block:: pycon

        >>> data = [[tok.text for tok in sent] for sent in docs[0].sents]
        >>> graph = build_cooccurrence_network(data, window_size=2)
        >>> sorted(graph.adjacency())[0]
        ('.', {'lamb': {'weight': 1}, 'snow': {'weight': 1}})

    Or as a ``Sequence[Sequence[str]]``, where elements are token or term strings
    per document from multiple documents:

    .. code-block:: pycon

        >>> data = [[tok.text for tok in doc] for doc in docs]
        >>> graph = build_cooccurrence_network(data, window_size=2)
        >>> sorted(graph.adjacency())[0]
        ('.', {'lamb': {'weight': 1}, 'Its': {'weight': 1}, 'snow': {'weight': 1}, 'go': {'weight': 1}})

    Note how the "." token's connections to other nodes change for each case.
    (Note that in real usage, you'll probably want to remove stopwords, punctuation, etc.
    so that nodes in the graph represent meaningful concepts.)

    Args:
        data
        window_size: Size of sliding window over ``data`` that determines
            which strings are said to co-occur. For example, a value of 2 means that
            only immediately adjacent strings will have edges in the network;
            larger values loosen the definition of co-occurrence and typically
            lead to a more densely-connected network.

            .. note:: Co-occurrence windows are not permitted to cross sequences.
               So, if ``data`` is a ``Sequence[Sequence[str]]``, then co-occ counts
               are computed separately for each sub-sequence, then summed together.

        edge_weighting: Method by which edges between nodes are weighted.
            If "count", nodes are connected by edges with weights equal to
            the number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight set equal to 1.

    Returns:
        Graph whose nodes correspond to individual strings from ``data``;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.

    Reference:
        https://en.wikipedia.org/wiki/Co-occurrence_network
    """
    if not data:
        LOGGER.warning("input `data` is empty, so output graph is also empty")
        return nx.Graph()
    if window_size < 2:
        raise ValueError(f"window_size = {window_size} is invalid; value must be >= 2")

    # input data is Sequence[str]
    if isinstance(data[0], str):
        windows = itertoolz.sliding_window(min(window_size, len(data)), data)
    # input data is Sequence[Sequence[str]]
    elif isinstance(data[0], Sequence) and isinstance(data[0][0], str):
        windows = itertoolz.concat(
            itertoolz.sliding_window(min(window_size, len(subseq)), subseq)
            for subseq in data
        )
    else:
        raise TypeError(
            errors.type_invalid_msg(
                "data", data, Union[Sequence[str], Sequence[Sequence[str]]]
            )
        )

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_counts = collections.Counter(
            w1_w2
            for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2)
        )
        graph.add_edges_from(
            (w1, w2, {"weight": weight})
            for (w1, w2), weight in cooc_counts.items()
        )
    elif edge_weighting == "binary":
        edge_data = {"weight": 1}
        graph.add_edges_from(
            (w1, w2, edge_data)
            for window in windows
            for (w1, w2) in itertools.combinations(window, 2)
        )
    else:
        raise ValueError(
            errors.value_invalid_msg(
                "edge_weighting", edge_weighting, {"count", "binary"}
            )
        )

    return graph
def co_occurence_matrix(target_index_dict, age_index_dict, childesdb_data):
    print('Creating co-occurrence matrices')
    window_type = 'forward'  # forward, backward, summed, concatenated
    window_size = 7
    window_weight = 'flat'  # linear or flat
    PAD = '*PAD*'

    # The goal is to create a 3 dimensional array of the following x,y,z dimensions:
    # MCDI words X MCDI words X Age
    num_targets = len(target_index_dict)
    num_ages = len(age_index_dict)
    cooc_matrix_by_age_list = []
    cumulative_cooc_matrix_by_age_list = []
    corpus_by_age_list = []
    for i in range(num_ages):
        corpus_by_age_list.append([])

    # Then specify what items (words) will be updating the correct row and columns.
    for i in range(len(childesdb_data)):
        utterance = childesdb_data[i][3]
        age = childesdb_data[i][2]
        age_index = age_index_dict[age]
        corpus_by_age_list[age_index] += utterance

    # now we are ready to start counting co-occurrences for each age
    for i in range(num_ages):
        cooc_matrix = np.zeros([num_targets, num_targets], float)
        cumulative_cooc_matrix = np.zeros([num_targets, num_targets], float)
        current_corpus = corpus_by_age_list[i]
        if len(current_corpus) > 0:
            # add pad such that all co-occurrences in last window are captured
            current_corpus += [PAD] * window_size
            windows = itertoolz.sliding_window(window_size, current_corpus)
            for w in windows:
                for word1, word2, dist in zip([w[0]] * (window_size - 1), w[1:], range(1, window_size)):
                    # increment
                    if word1 == PAD or word2 == PAD:
                        continue
                    if word1 not in target_index_dict:
                        continue
                    if word2 not in target_index_dict:
                        continue
                    word1_index = target_index_dict[word1]
                    word2_index = target_index_dict[word2]
                    if window_weight == "linear":
                        cooc_matrix[word1_index, word2_index] += window_size - dist
                    elif window_weight == "flat":
                        cooc_matrix[word1_index, word2_index] += 1

        # window_type
        if window_type == 'forward':
            final_matrix = cooc_matrix
        elif window_type == 'backward':
            final_matrix = cooc_matrix.transpose()
        elif window_type == 'summed':
            final_matrix = cooc_matrix + cooc_matrix.transpose()
        elif window_type == 'concatenate':
            final_matrix = np.concatenate((cooc_matrix, cooc_matrix.transpose()))
        else:
            raise AttributeError('Invalid arg to "window_type".')
        cooc_matrix_by_age_list.append(final_matrix)

    current_cumul_cooc_matrix = np.zeros([num_targets, num_targets], float)
    for i in range(num_ages):
        current_cooc_matrix = cooc_matrix_by_age_list[i]
        current_cumul_cooc_matrix += current_cooc_matrix
        cumulative_cooc_matrix_by_age_list.append(current_cumul_cooc_matrix.copy())

    return cooc_matrix_by_age_list, cumulative_cooc_matrix_by_age_list
def terms_to_semantic_network(
    terms: Union[Sequence[str], Sequence[Token]],
    *,
    normalize: Union[str, bool, Callable[[Token], str]] = "lemma",
    window_width: int = 10,
    edge_weighting: str = "cooc_freq",
) -> nx.Graph:
    """
    Transform an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_width`` terms of itself.

    Args:
        terms
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if falsy, use the form of terms as they appear in ``terms``;
            if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``Token``.

        window_width: Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({'cooc_freq', 'binary'}): If 'cooc_freq', the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if 'binary', all such edges have weight = 1.

    Returns:
        Networkx graph whose nodes represent individual terms, connected by edges
        based on term co-occurrence with weights determined by ``edge_weighting``.

    Note:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or ``Token`` s beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`textacy.spacier.utils.get_normalized_text()`
    """
    if window_width < 2:
        raise ValueError(
            "`window_width` = {} is invalid; value must be >= 2".format(window_width))
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            "`terms` has fewer items (%s) than the specified `window_width` (%s); "
            "setting window width to %s",
            len(terms),
            window_width,
            len(terms),
        )
        window_width = len(terms)

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], Token):
        if normalize == "lemma":
            windows = (
                (tok.lemma_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == "lower":
            windows = (
                (tok.lower_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = (
                (tok.text for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = (
                (normalize(tok) for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(terms[0])))

    graph = nx.Graph()
    if edge_weighting == "cooc_freq":
        cooc_mat: DefaultDict[str, DefaultDict[str, int]]
        cooc_mat = collections.defaultdict(lambda: collections.defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {"weight": weight})
            for w1, w2s in cooc_mat.items()
            for w2, weight in w2s.items())
    elif edge_weighting == "binary":
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph
def test_sliding_window_of_short_iterator():
    assert list(sliding_window(3, [1, 2])) == []
def scake(
    doc: Doc,
    *,
    normalize: Optional[Union[str, Callable[[Token], str]]] = "lemma",
    include_pos: Optional[Union[str, Collection[str]]] = ("NOUN", "PROPN", "ADJ"),
    topn: Union[int, float] = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the sCAKE algorithm.

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms. Must be sentence-segmented;
            optionally POS-tagged.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if None, use the form of terms as they appeared in ``doc``;
            if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        include_pos: One or more POS tags with which to filter for good candidate keyterms.
            If None, include tokens of all POS tags
            (which also allows keyterm extraction from docs without POS-tagging.)
        topn: Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding scores.

    References:
        Duari, Swagata & Bhatnagar, Vasudha. (2018). sCAKE: Semantic Connectivity
        Aware Keyword Extraction. Information Sciences. 477.
        https://arxiv.org/abs/1811.10831v1
    """
    # validate / transform args
    include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn))

    # bail out on empty docs
    if not doc:
        return []

    # build up a graph of good words, edges weighting by adjacent sentence co-occurrence
    cooc_mat: Counter[Tuple[str, str]] = collections.Counter()
    # handle edge case where doc only has 1 sentence
    n_sents = itertoolz.count(doc.sents)
    for window_sents in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        if n_sents == 1:
            window_sents = (window_sents[0], [])
        window_words: Iterable[str] = (
            word
            for word in itertoolz.concat(window_sents)
            if not (word.is_stop or word.is_punct or word.is_space)
            and (not include_pos or word.pos_ in include_pos)
        )
        window_words = ke_utils.normalize_terms(window_words, normalize)
        cooc_mat.update(
            w1_w2
            for w1_w2 in itertools.combinations(sorted(window_words), 2)
            if w1_w2[0] != w1_w2[1]
        )
    # doc doesn't have any valid words...
    if not cooc_mat:
        return []

    graph = nx.Graph()
    graph.add_edges_from(
        (w1, w2, {"weight": weight}) for (w1, w2), weight in cooc_mat.items())

    word_scores = _compute_word_scores(doc, graph, cooc_mat, normalize)
    if not word_scores:
        return []

    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
    if isinstance(topn, float):
        topn = int(round(len(set(candidates)) * topn))
    # rank candidates by aggregating constituent word scores
    candidate_scores = {
        " ".join(candidate): sum(word_scores.get(word, 0.0) for word in candidate)
        for candidate in candidates
    }
    sorted_candidate_scores = sorted(
        candidate_scores.items(), key=operator.itemgetter(1, 0), reverse=True)
    return ke_utils.get_filtered_topn_terms(
        sorted_candidate_scores, topn, match_threshold=0.8)
def test_sliding_window():
    assert list(sliding_window(2, [1, 2, 3, 4])) == [(1, 2), (2, 3), (3, 4)]
    assert list(sliding_window(3, [1, 2, 3, 4])) == [(1, 2, 3), (2, 3, 4)]
def get_sliding_windows(
    window_size: int,
    tokens: List[str],
) -> List[List[str]]:
    res = list(itertoolz.sliding_window(window_size, tokens))
    return res
def get_sliding_windows(window_size, tokens):
    res = list(itertoolz.sliding_window(window_size, tokens))
    return res
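# Quick illustration of what the helpers above return (hypothetical inputs, not from
# the original source); cytoolz's sliding_window yields tuples of length window_size.
from cytoolz import itertoolz

assert list(itertoolz.sliding_window(3, ["a", "b", "c", "d"])) == [
    ("a", "b", "c"),
    ("b", "c", "d"),
]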
def build_graph_from_terms(terms, *, normalize="lemma", window_size=10, edge_weighting="count"):
    """
    Transform an ordered list of non-overlapping terms into a graph,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_size`` terms of itself.

    Args:
        terms (List[str] or List[:class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`])
        normalize (str or Callable): If "lemma", lemmatize terms; if "lower",
            lowercase terms; if falsy, use the form of terms as they appear in ``terms``;
            if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``Token`` or ``Span``.

        window_size (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({"count", "binary"}): If "count", the nodes for all
            co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if "binary", all such edges have weight = 1.

    Returns:
        :class:`networkx.Graph`: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.
    """
    if window_size < 2:
        raise ValueError(
            "window_size = {} is invalid; value must be >= 2".format(window_size))
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()

    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms),
            window_size,
            len(terms),
        )
        window_size = len(terms)

    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, str):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, ke_utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)))

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2
            for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2))
        graph.add_edges_from(
            (w1, w2, {"weight": weight}) for (w1, w2), weight in cooc_mat.items())
    elif edge_weighting == "binary":
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))
    else:
        raise ValueError(
            "edge_weighting = {} is invalid; must be one of {}".format(
                edge_weighting, {"count", "binary"}))

    return graph
def terms_to_semantic_network(terms,
                              normalize='lemma',
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (List[str] or List[``spacy.Token``])
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in doc; if a callable, must accept a ``spacy.Token`` and return a str,
            e.g. :func:`textacy.spacy_utils.normalized_str()`;
            only applicable if ``terms`` is a List[``spacy.Token``]
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        ``networkx.Graph``: Nodes are terms, edges are for co-occurrences of terms.

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`normalized_str() <textacy.spacy_utils.normalized_str>`
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2')
    if not terms:
        raise ValueError(
            '`terms` list is empty; it must contain 1 or more terms')
    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.warning(
            'input terms list is smaller than window width (%s < %s)',
            len(terms), window_width)
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], SpacyToken):
        if normalize == 'lemma':
            windows = (
                (tok.lemma_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == 'lower':
            windows = (
                (tok.lower_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = (
                (tok.text for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = (
                (normalize(tok) for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(
            type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()
    if edge_weighting == 'cooc_freq':
        cooc_mat = collections.defaultdict(
            lambda: collections.defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': cooc_mat[w1][w2]})
            for w1, w2s in cooc_mat.items() for w2 in w2s)
    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph
def make_sparse_ww_matrix(docs: Generator[List[str], None, None],
                          w2id: Dict[str, int],
                          window_size: int,
                          window_type: str,
                          window_weight: str,
                          max_num_docs: Optional[int] = None,
                          pad='*PAD*',
                          ) -> sparse.coo_matrix:
    print('Counting word-word co-occurrences in {}-word moving window'.format(window_size))

    # init lists for sparse matrix construction
    rows = []
    cols = []
    data = []

    if max_num_docs is not None:
        docs = islice(docs, max_num_docs)

    print(len(w2id))
    for tokens in docs:
        # pad tokens such that all co-occurrences in last window are captured
        padding = (pad for _ in range(window_size))
        tokens_padded = chain(tokens, padding)
        # + 1 because window consists of w2s only
        for window in itertoolz.sliding_window(window_size + 1, tokens_padded):
            for w1, w2, dist in zip([window[0]] * window_size, window[1:], range(window_size)):
                # skip padding tokens before appending, so rows, cols, and data stay aligned
                if w1 == pad or w2 == pad:
                    continue
                if w1 in w2id and w2 in w2id:
                    w1_id = w2id[w1]
                    w2_id = w2id[w2]
                    rows.append(w1_id)
                    cols.append(w2_id)
                    # increment
                    if window_weight == "linear":
                        data.append(window_size - dist)
                    elif window_weight == "flat":
                        data.append(1)

    matrix = sparse.coo_matrix((np.array(data, dtype=np.int32), (rows, cols)))

    # window_type
    if window_type == 'forward':
        matrix = matrix
    elif window_type == 'backward':
        matrix = matrix.transpose()
    elif window_type == 'summed':
        matrix = matrix + matrix.transpose()
    elif window_type == 'concatenated':
        matrix = np.concatenate((matrix, matrix.transpose()), axis=1)
    else:
        raise AttributeError('Invalid arg to "window_type".')

    print('Shape of matrix={}'.format(matrix.shape))
    return matrix
def terms_to_semantic_network(terms,
                              normalize='lemma',
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Transform an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with weighted edges linking it to
    other terms that co-occur within ``window_width`` terms of itself.

    Args:
        terms (List[str] or List[``spacy.Token``])
        normalize (str or Callable): If 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``spacy.Token`` and
            return a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``spacy.Token``.

        window_width (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({'cooc_freq', 'binary'}): If 'cooc_freq', the nodes for
            all co-occurring terms are connected by edges with weight equal to
            the number of times they co-occurred within a sliding window;
            if 'binary', all such edges have weight = 1.

    Returns:
        ``networkx.Graph``: Nodes in this network correspond to individual terms;
        those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to have normalized them so that
          like terms are counted together; for example, by applying
          :func:`textacy.spacier.utils.get_normalized_text()`
    """
    if window_width < 2:
        raise ValueError(
            '`window_width` = {} is invalid; value must be >= 2'.format(window_width))
    if not terms:
        raise ValueError(
            '`terms` = {} is invalid; it must contain at least 1 term '
            'in the form of a string or spacy token'.format(terms))
    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            '`terms` has fewer items (%s) than the specified `window_width` (%s); '
            'setting window width to %s',
            len(terms), window_width, len(terms))
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], SpacyToken):
        if normalize == 'lemma':
            windows = (
                (tok.lemma_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif normalize == 'lower':
            windows = (
                (tok.lower_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        elif not normalize:
            windows = (
                (tok.text for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
        else:
            windows = (
                (normalize(tok) for tok in window)
                for window in itertoolz.sliding_window(window_width, terms))
    else:
        raise TypeError(
            'items in `terms` must be strings or spacy tokens, not {}'.format(
                type(terms[0])))

    graph = nx.Graph()
    if edge_weighting == 'cooc_freq':
        cooc_mat = collections.defaultdict(lambda: collections.defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': weight})
            for w1, w2s in cooc_mat.items()
            for w2, weight in w2s.items())
    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph