コード例 #1
0
from childesngrams import configs
from childesngrams.io import load_tokens
from childesngrams.utils import get_sliding_windows

# Corpus to analyse and the largest n-gram order to consider.
CORPUS_NAME = 'childes-20201026'
MAX_NGRAM_SIZE = 7

# Split the token sequence at its midpoint; when the number of tokens is odd,
# the second part receives the extra token.
tokens = load_tokens(CORPUS_NAME)
midpoint = len(tokens) // 2
tokens1, tokens2 = tokens[:midpoint], tokens[midpoint:]

ngram_sizes = list(range(1, MAX_NGRAM_SIZE + 1))
part_id2y = {part_id: [] for part_id in (1, 2)}  # part id -> one value per n-gram size
for ngram_size in ngram_sizes:
    # slide a window of the current size over each part
    ngrams1 = get_sliding_windows(ngram_size, tokens1)
    ngrams2 = get_sliding_windows(ngram_size, tokens2)
    num_ngrams1, num_ngrams2 = len(ngrams1), len(ngrams2)

    # reduce each part to its distinct n-grams
    unique_ngrams1, unique_ngrams2 = set(ngrams1), set(ngrams2)

    # number of distinct n-grams shared by both parts, expressed as a
    # fraction of each part's own distinct n-grams
    intersection = len(unique_ngrams1 & unique_ngrams2)
    yi1 = intersection / len(unique_ngrams1)
    yi2 = intersection / len(unique_ngrams2)

    # collect
    part_id2y[1].append(yi1)
コード例 #2
0
from collections import Counter
import numpy as np

from childesngrams import configs
from childesngrams.io import load_tokens
from childesngrams.utils import get_sliding_windows

# /////////////////////////////////////////////////////////////////

# Name of the corpus whose bi-gram frequencies are written out.
CORPUS_NAME = 'childes-20201026'

tokens = load_tokens(CORPUS_NAME)

# Count every bi-gram, then tabulate how many distinct bi-grams share each
# frequency value (np.unique returns the frequency values in ascending order).
ngram2f = Counter(get_sliding_windows(2, tokens))
unique_fs, counts = np.unique(list(ngram2f.values()), return_counts=True)
num_total = counts.sum()
print(len(unique_fs), len(counts), num_total)

print(unique_fs)

# Percentile of a frequency value: the share (0-100) of distinct bi-grams whose
# frequency is less than or equal to it. It depends only on the frequency, so
# it is computed once per distinct frequency with a cumulative sum instead of
# rescanning `unique_fs` for every single bi-gram.
freq2percentile = dict(zip(unique_fs.tolist(), np.cumsum(counts) / num_total * 100))

p = configs.Dirs.bi_grams / 'bi-grams.txt'
# NOTE(review): no explicit encoding is given, so the platform default is used.
with p.open('w') as f:
    # one line per bi-gram, most frequent first:
    # "<frequency> <first token> <second token> <percentile>"
    for ng, freq in ngram2f.most_common():
        percentile = freq2percentile[freq]
        f.write(f'{freq} {ng[0]} {ng[1]} {percentile}\n')
コード例 #3
0
            trajectory.append(np.nan)
        pbar.update()
    # res
    ns = np.where(np.array(trajectory) == 1)[0]
    hist, b = np.histogram(ns, bins=NUM_BINS, range=[0, num_ngrams])
    res = (b[:-1], hist)
    return res


# size2novel_xys
# For each n-gram size, run make_novel_xys twice: once on the n-grams in
# corpus order and once on the same n-grams in reversed order. Results are
# stored per n-gram size in two parallel dicts.
num_ngram_sizes = len(NGRAM_SIZES)
size2novel_xys1, size2novel_xys2 = {}, {}
for ngram_size in NGRAM_SIZES:
    ngram_range = (ngram_size,) * 2
    ngrams = get_sliding_windows(ngram_size, tokens)
    # forward pass first, then the pass over the reversed n-gram sequence
    xys1, xys2 = make_novel_xys(ngrams), make_novel_xys(ngrams[::-1])
    size2novel_xys1[ngram_size], size2novel_xys2[ngram_size] = xys1, xys2

# fig
# One vertically stacked subplot per n-gram size, all sharing the x-axis.
fig, axs = plt.subplots(num_ngram_sizes, 1, sharex='all', dpi=configs.Fig.dpi, figsize=None)
if num_ngram_sizes == 1:
    # with a single subplot, plt.subplots returns a bare Axes rather than an
    # array, so wrap it to keep the loop below uniform
    axs = [axs]
for ax, ngram_size in zip(axs, NGRAM_SIZES):
    if ax == axs[-1]:
        # bottom-most subplot: keeps its bottom ticks and carries the label
        ax.tick_params(axis='both', which='both', top=False, right=False)
        ax.set_ylabel('Corpus Location', fontsize=configs.Fig.ax_fontsize)
    else:
        # tick_params expects booleans; the legacy string 'off' is truthy, so
        # on current Matplotlib it would leave the bottom ticks visible
        ax.tick_params(axis='both', which='both', top=False, right=False, bottom=False)