from childesngrams import configs
from childesngrams.io import load_tokens
from childesngrams.utils import get_sliding_windows

# Split-half overlap analysis: cut the corpus into a first and a second half
# and, for every n-gram size from 1 to MAX_NGRAM_SIZE, compute the fraction of
# one half's unique n-grams that also occur in the other half.

CORPUS_NAME = 'childes-20201026'
MAX_NGRAM_SIZE = 7

tokens = load_tokens(CORPUS_NAME)

# Split at the midpoint. With an odd number of tokens the extra token ends up
# in the second half.
midpoint = len(tokens) // 2
tokens1 = tokens[:midpoint]
tokens2 = tokens[midpoint:]

ngram_sizes = list(range(1, MAX_NGRAM_SIZE + 1))

# part id (1 = first half, 2 = second half) -> one overlap value per n-gram size
part_id2y = {part_id: [] for part_id in (1, 2)}

for ngram_size in ngram_sizes:
    # n-grams of the current size, separately for each half
    ngrams1 = get_sliding_windows(ngram_size, tokens1)
    ngrams2 = get_sliding_windows(ngram_size, tokens2)
    num_ngrams1, num_ngrams2 = len(ngrams1), len(ngrams2)

    # n-gram types (duplicates removed)
    unique_ngrams1, unique_ngrams2 = set(ngrams1), set(ngrams2)

    # number of n-gram types shared by both halves, expressed as a fraction
    # (0 to 1, not scaled to percent) of each half's own types
    intersection = len(unique_ngrams2 & unique_ngrams1)
    yi1 = intersection / len(unique_ngrams1)
    yi2 = intersection / len(unique_ngrams2)

    # collect
    # NOTE(review): yi2 is computed but not stored anywhere in the code visible
    # here - confirm that part_id2y[2] is filled further down.
    part_id2y[1].append(yi1)
from collections import Counter
import numpy as np

from childesngrams import configs
from childesngrams.io import load_tokens
from childesngrams.utils import get_sliding_windows

# /////////////////////////////////////////////////////////////////

# Write every bi-gram of the corpus to a text file, most frequent first, one
# per line as "<frequency> <first item> <second item> <percentile>", where
# percentile is the percentage of bi-gram types whose frequency is less than
# or equal to that bi-gram's frequency.

CORPUS_NAME = 'childes-20201026'

tokens = load_tokens(CORPUS_NAME)
ngram2f = Counter(get_sliding_windows(2, tokens))

# unique_fs: the distinct frequency values, sorted ascending by np.unique
# counts:    how many bi-gram types have each of those frequencies
unique_fs, counts = np.unique(list(ngram2f.values()), return_counts=True)
num_total = counts.sum()
print(len(unique_fs), len(counts), num_total)
print(unique_fs)

# The percentile depends only on the frequency value, so compute it once per
# distinct frequency instead of rescanning the frequency table for every
# bi-gram. Because unique_fs is sorted ascending, the running sum of counts is
# the number of bi-gram types with frequency <= unique_fs[i].
percentiles = np.cumsum(counts) / num_total * 100
freq2percentile = dict(zip(unique_fs.tolist(), percentiles))

p = configs.Dirs.bi_grams / 'bi-grams.txt'
with p.open('w') as f:
    # most_common() sorts by descending count and keeps the Counter's
    # insertion order among ties
    for ng, freq in ngram2f.most_common():
        percentile = freq2percentile[freq]
        f.write(f'{freq} {ng[0]} {ng[1]} {percentile}\n')
trajectory.append(np.nan) pbar.update() # res ns = np.where(np.array(trajectory) == 1)[0] hist, b = np.histogram(ns, bins=NUM_BINS, range=[0, num_ngrams]) res = (b[:-1], hist) return res # size2novel_xys num_ngram_sizes = len(NGRAM_SIZES) size2novel_xys1 = {} size2novel_xys2 = {} for ngram_size in NGRAM_SIZES: ngram_range = (ngram_size, ngram_size) ngrams = get_sliding_windows(ngram_size, tokens) xys1 = make_novel_xys(ngrams) xys2 = make_novel_xys(ngrams[::-1]) size2novel_xys1[ngram_size] = xys1 size2novel_xys2[ngram_size] = xys2 # fig fig, axs = plt.subplots(num_ngram_sizes, 1, sharex='all', dpi=configs.Fig.dpi, figsize=None) if num_ngram_sizes == 1: axs = [axs] for ax, ngram_size in zip(axs, NGRAM_SIZES): if ax == axs[-1]: ax.tick_params(axis='both', which='both', top=False, right=False) ax.set_ylabel('Corpus Location', fontsize=configs.Fig.ax_fontsize) else: ax.tick_params(axis='both', which='both', top=False, right=False, bottom='off')