def perform_min_analysis(texts):  # pass a list of strs, one for each text
    # necessary because tokenize returns a generator, and then calling frequency
    # or collocator or whatever will run it to StopIteration, so you have to get
    # a new one each time
    get_new_tokenized = lambda: ct.tokenize(texts, lemma=False)

    freq = ct.frequency(get_new_tokenized())
    # ct.head(freq, hits=10)

    words_of_interest = ["เอา"]

    # get all ngrams
    n = 2
    ngrams = ct.tokenize(texts, lemma=False, ngram=n)
    ngram_freq = ct.frequency(ngrams)

    # subset of these ngrams where word of interest is present somewhere
    keys_containing_words_of_interest = [
        k for k in ngram_freq.keys()
        if any(w in k for w in words_of_interest)
    ]
    freqs_of_interest_anywhere = {
        k: ngram_freq[k] for k in keys_containing_words_of_interest
    }
    print("\n---- ngrams containing words of interest anywhere ----")
    ct.head(freqs_of_interest_anywhere, hits=10)

    # subset of these ngrams where word of interest is at the beginning
    keys_beginning_with_words_of_interest = [
        k for k in ngram_freq.keys()
        if any(k[:len(w)] == w for w in words_of_interest)
    ]
    freqs_of_interest_beginning = {
        k: ngram_freq[k] for k in keys_beginning_with_words_of_interest
    }
    print("\n---- ngrams containing words of interest at beginning ----")
    ct.head(freqs_of_interest_beginning, hits=10)

    # subset of these ngrams where word of interest is at the end
    keys_ending_with_words_of_interest = [
        k for k in ngram_freq.keys()
        if any(k[-len(w):] == w for w in words_of_interest)
    ]
    freqs_of_interest_ending = {
        k: ngram_freq[k] for k in keys_ending_with_words_of_interest
    }
    print("\n---- ngrams containing words of interest at end ----")
    ct.head(freqs_of_interest_ending, hits=10)

    # show collocations of most frequent words or words of interest
    # n_most_frequent_words = 5
    # collocation_words_tups = get_top_n_dict_items(freq, n_most_frequent_words)
    metrics = ["MI", "T", "freq", "right", "left"]
    # for word, _ in collocation_words_tups:
    for word in words_of_interest:
        for metric in metrics:
            collocates = ct.collocator(get_new_tokenized(), word, stat=metric)
            print("----\nCollocations for {} using stat={}:".format(word, metric))
            ct.head(collocates, hits=10)
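
# Minimal sketch of why get_new_tokenized is needed above: ct.tokenize returns a
# generator, so a second pass over the same object yields nothing. The example
# texts here are made up; this function is not called anywhere in the script.
def demo_generator_exhaustion():
    tok = ct.tokenize(["cat cat dog", "a b a"], lemma=False)
    first = ct.frequency(tok)   # consumes the generator
    second = ct.frequency(tok)  # generator already exhausted, so this is empty
    print(len(first), len(second))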
def compare_collocation(word):
    """
    Calculate collocates for a particular word in each corpus and compare them.
    Here, we use the built-in tokenizer in corpus_toolkit (simple split by
    whitespace) to avoid removing special characters/punctuation that might be
    collocates.
    """
    corp1_collocates = ct.collocator(ct.tokenize(ct.ldcorpus(corpus1)), word, stat="MI")
    corp2_collocates = ct.collocator(ct.tokenize(ct.ldcorpus(corpus2)), word, stat="MI")

    print(f'\n--------Collocates for the word `{word}`: {corpus1}')
    ct.head(corp1_collocates, hits=20)
    print(f'\n--------Collocates for the word `{word}`: {corpus2}')
    ct.head(corp2_collocates, hits=20)
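
# Hypothetical usage sketch for compare_collocation: it relies on module-level
# corpus1 and corpus2, directory names that ct.ldcorpus can load. The directory
# names below are placeholders, not real corpora; this function is not called.
def demo_compare_collocation():
    global corpus1, corpus2
    corpus1 = "corpus1_dir"  # placeholder: a directory of .txt files
    corpus2 = "corpus2_dir"  # placeholder: a directory of .txt files
    compare_collocation("เอา")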
def perform_kris_analysis(texts):  # pass a list of strs, one for each text
    # necessary because tokenize returns a generator, and then calling frequency
    # or collocator or whatever will run it to StopIteration, so you have to get
    # a new one each time
    get_new_tokenized = lambda: ct.tokenize(texts, lemma=False)

    freq = ct.frequency(get_new_tokenized())
    ct.head(freq, hits=10)

    collocation_words_tups = get_top_n_dict_items(freq, 5)
    for word, _ in collocation_words_tups:
        collocates = ct.collocator(get_new_tokenized(), word, stat="MI")
        print("----\nCollocations for {}:".format(word))
        ct.head(collocates, hits=10)

    # could do some keyness between different pairs of texts

    for ngram_n in [2, 3, 4]:
        tokenized_ngram = ct.tokenize(texts, lemma=False, ngram=ngram_n)
        ngram_freq = ct.frequency(tokenized_ngram)
        print("----\n{}-grams:".format(ngram_n))
        ct.head(ngram_freq, hits=10)
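
# perform_kris_analysis (and the commented-out block in perform_min_analysis)
# calls get_top_n_dict_items, which is not defined in this section. If it is not
# defined elsewhere in this script, a minimal version could look like the sketch
# below: return the n highest-valued (key, value) pairs of a frequency dict.
def get_top_n_dict_items(d, n):
    return sorted(d.items(), key=lambda kv: kv[1], reverse=True)[:n]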
def get_corpus_size_words(texts):
    tokenized = ct.tokenize(texts, lemma=False)
    return sum(len(x) for x in tokenized)
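
# Usage sketch (hypothetical values): get_corpus_size_words sums the token count
# of each tokenized text, so for two texts of 3 and 2 whitespace-separated words
# it would return about 5 (the exact count depends on what ct.tokenize keeps).
# Not called anywhere in the script.
def demo_corpus_size():
    print(get_corpus_size_words(["cat cat dog", "a b"]))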
# testing Kris's library
from corpus_toolkit import corpus_tools as ct
import KrisCorpusToolsDebugging as ct_debug
# documentation at https://github.com/kristopherkyle/corpus_toolkit

text1 = "cat cat dog cat elephant"
text2 = "the of and a to in is that it was"
text3 = "a a a a a aa a a a aa aaaaa"
texts = [text1, text2, text3]
tokenized = ct.tokenize(texts)
freq = ct.frequency(tokenized)
ct.head(freq, hits=5)
# the frequency function iterates over "tokenized texts" first, then over tokens
# in those texts, so its input must be an iterable of iterables
# see source at https://github.com/kristopherkyle/corpus_toolkit/blob/b5f0eba13dee60a0b56a25c5f3f900fe7c8c8cb4/build/lib/corpus_toolkit/corpus_tools.py

# what if I include capitals and punctuation
print("----")
text4 = "Cat? Dog! A man, a plan, a canal, Panama! A dog, a panic in a pagoda. Most most most most most most most most most, most? Most! MOST... most?!?! most: Most most most most most most, (most) [most]."
texts.append(text4)
tokenized = ct.tokenize(texts)
freq = ct.frequency(tokenized)
ct.head(freq, hits=10)

print("----")
corpora_dir = "/home/wesley/Desktop/UOregon Work/CorpusLinguistics/corpora/"
text_files = ["dracula.txt", "wuthering_heights.txt"]
texts = []
for text_file in text_files:
    fp = corpora_dir + text_file
    with open(fp) as f: