Example 1
def perform_min_analysis(texts):
    # pass a list of strs, one for each text
    # ct.tokenize returns a generator, which the first call to frequency(),
    # collocator(), etc. consumes to exhaustion, so build a fresh one for each
    # pass (see the standalone sketch after this function)
    get_new_tokenized = lambda: ct.tokenize(texts, lemma=False)
    freq = ct.frequency(get_new_tokenized())
    # ct.head(freq, hits=10)

    words_of_interest = ["เอา"]
    # get all ngrams
    n = 2
    ngrams = ct.tokenize(texts, lemma=False, ngram=n)
    ngram_freq = ct.frequency(ngrams)

    # subset of these ngrams where word of interest is present somewhere
    keys_containing_words_of_interest = [
        k for k in ngram_freq.keys() if any(w in k for w in words_of_interest)
    ]
    freqs_of_interest_anywhere = {
        k: ngram_freq[k]
        for k in keys_containing_words_of_interest
    }
    print("\n---- ngrams containing words of interest anywhere ----")
    ct.head(freqs_of_interest_anywhere, hits=10)

    # subset of these ngrams where word of interest is at the beginning
    keys_beginning_with_words_of_interest = [
        k for k in ngram_freq.keys()
        if any(k[:len(w)] == w for w in words_of_interest)
    ]
    freqs_of_interest_beginning = {
        k: ngram_freq[k]
        for k in keys_beginning_with_words_of_interest
    }
    print("\n---- ngrams containing words of interest at beginning ----")
    ct.head(freqs_of_interest_beginning, hits=10)

    # subset of these ngrams where word of interest is at the end
    keys_ending_with_words_of_interest = [
        k for k in ngram_freq.keys()
        if any(k[-len(w):] == w for w in words_of_interest)
    ]
    freqs_of_interest_ending = {
        k: ngram_freq[k]
        for k in keys_ending_with_words_of_interest
    }
    print("\n---- ngrams containing words of interest at end ----")
    ct.head(freqs_of_interest_ending, hits=10)

    # show collocations of most frequent words or words of interest
    # n_most_frequent_words = 5
    # collocation_words_tups = get_top_n_dict_items(freq, n_most_frequent_words)
    metrics = ["MI", "T", "freq", "right", "left"]
    # for word, _ in collocation_words_tups:
    for word in words_of_interest:
        for metric in metrics:
            collocates = ct.collocator(get_new_tokenized(), word, stat=metric)
            print("----\nCollocations for {} using stat={}:".format(
                word, metric))
            ct.head(collocates, hits=10)
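
A minimal standalone sketch of the generator-exhaustion issue described in the comment inside perform_min_analysis; `texts` is assumed to be the same list of strings passed to that function, and the expected-empty result is an assumption about how ct.frequency behaves on an exhausted generator.

def demo_generator_exhaustion(texts):
    tokenized = ct.tokenize(texts, lemma=False)
    first = ct.frequency(tokenized)   # first pass consumes the generator
    second = ct.frequency(tokenized)  # generator already exhausted; expect an empty dict
    print(len(first), len(second))
    # the fix: call ct.tokenize again for every pass over the corpus
    fresh = ct.frequency(ct.tokenize(texts, lemma=False))
    print(len(fresh))
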
Example 2
def compare_collocation(word):
    """ Calculate collocates for a particular word in each corpus and compare them.
        Here, we use the built-in tokenizer in corpus_toolkit (simple split by whitespace)
        to avoid removing special characters/punctuation that might be collocates.
    """
    corp1_collocates = ct.collocator(ct.tokenize(ct.ldcorpus(corpus1)),
                                     word,
                                     stat="MI")
    corp2_collocates = ct.collocator(ct.tokenize(ct.ldcorpus(corpus2)),
                                     word,
                                     stat="MI")
    print(f'\n--------Collocates for the word `{word}`: {corpus1}')
    ct.head(corp1_collocates, hits=20)
    print(f'\n--------Collocates for the word `{word}`: {corpus2}')
    ct.head(corp2_collocates, hits=20)
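
Hypothetical usage sketch: compare_collocation reads corpus1 and corpus2 as module-level directory names for ct.ldcorpus; the directory names and target word below are placeholders, not values from the original script.

corpus1 = "corpus_a"  # placeholder: directory of .txt files
corpus2 = "corpus_b"  # placeholder: directory of .txt files
compare_collocation("time")
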
Example 3
def perform_kris_analysis(texts):
    # pass a list of strs, one for each text
    # ct.tokenize returns a generator, which the first call to frequency(),
    # collocator(), etc. consumes to exhaustion, so build a fresh one for each pass
    get_new_tokenized = lambda: ct.tokenize(texts, lemma=False)
    freq = ct.frequency(get_new_tokenized())
    ct.head(freq, hits=10)

    collocation_words_tups = get_top_n_dict_items(freq, 5)
    for word, _ in collocation_words_tups:
        collocates = ct.collocator(get_new_tokenized(), word, stat="MI")
        print("----\nCollocations for {}:".format(word))
        ct.head(collocates, hits=10)

    # could do some keyness between different pairs of texts (see the sketch after this function)

    for ngram_n in [2, 3, 4]:
        tokenized_ngram = ct.tokenize(texts, lemma=False, ngram=ngram_n)
        ngram_freq = ct.frequency(tokenized_ngram)
        print("----\n{}-grams:".format(ngram_n))
        ct.head(ngram_freq, hits=10)
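
A hedged sketch of the keyness idea noted in the comment above: ct.keyness compares two frequency dicts, and the effect="log-ratio" argument follows the corpus_toolkit documentation; the function name and the pairing of texts[0] against texts[1] are illustrative choices, not part of the original analysis.

def sketch_keyness(texts):
    # compare the first two texts pairwise; keyness() takes two frequency dicts
    freq_a = ct.frequency(ct.tokenize([texts[0]], lemma=False))
    freq_b = ct.frequency(ct.tokenize([texts[1]], lemma=False))
    key = ct.keyness(freq_a, freq_b, effect="log-ratio")
    ct.head(key, hits=10)
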
Example 4
def get_corpus_size_words(texts):
    tokenized = ct.tokenize(texts, lemma=False)
    return sum(len(x) for x in tokenized)
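
A quick sanity check with hypothetical input: the function sums per-text token counts, so a single five-word text should yield 5.

print(get_corpus_size_words(["cat cat dog cat elephant"]))  # expected: 5
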
Example 5
# testing Kris's library
from corpus_toolkit import corpus_tools as ct
import KrisCorpusToolsDebugging as ct_debug
# documentation at https://github.com/kristopherkyle/corpus_toolkit

text1 = "cat cat dog cat elephant"
text2 = "the of and a to in is that it was"
text3 = "a a a a a aa a a a aa aaaaa"
texts = [text1, text2, text3]
tokenized = ct.tokenize(texts)
freq = ct.frequency(tokenized)
ct.head(freq, hits=5)
# the frequency function iterates over the tokenized texts first, then over the tokens in each text,
# so the input must be an iterable of iterables
# see source at https://github.com/kristopherkyle/corpus_toolkit/blob/b5f0eba13dee60a0b56a25c5f3f900fe7c8c8cb4/build/lib/corpus_toolkit/corpus_tools.py
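
As a small illustration of that requirement (a sketch, not from the library docs), a plain list of token lists should work the same way as the tokenize generator:

toy_freq = ct.frequency([["cat", "dog", "cat"], ["cat"]])
ct.head(toy_freq, hits=2)
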

# what if I include capitals and punctuation
print("----")
text4 = "Cat? Dog! A man, a plan, a canal, Panama! A dog, a panic in a pagoda. Most most most most most most most most most, most? Most! MOST... most?!?! most: Most most most most most most, (most) [most]."
texts.append(text4)
tokenized = ct.tokenize(texts)
freq = ct.frequency(tokenized)
ct.head(freq, hits=10)


print("----")
corpora_dir = "/home/wesley/Desktop/UOregon Work/CorpusLinguistics/corpora/"
text_files = ["dracula.txt", "wuthering_heights.txt"]
texts = []
for text_file in text_files:
    fp = corpora_dir + text_file
    with open(fp) as f: