def get_word_frequencies():
    """
    Count every tagged token in the corpus returned by unpickle_cds().

    Each token is counted as a (token, tag) pair, where the tag is the
    result of passing the original POS tag through
    collapse_function_tags(). Tokens are counted exactly as stored (no
    case folding is applied here).

    Returns a list of ((token, tag), frequency) tuples ordered from the
    most frequent to the least frequent.
    """
    corpus = unpickle_cds()

    tally = Counter()
    for file_key in corpus:
        for tagged_sentence in corpus[file_key]:
            # Feed one sentence at a time into the counter; the pairs are
            # seen in the same order as when flattening the whole corpus.
            tally.update(
                (token, collapse_function_tags(pos_tag))
                for token, pos_tag in tagged_sentence
            )

    # Stable ascending sort followed by a reversal. This is deliberately
    # not sorted(..., reverse=True): the two differ in how entries with
    # equal counts are ordered relative to each other.
    ascending = sorted(tally.items(), key=operator.itemgetter(1))
    return ascending[::-1]
def get_cds_words(collapse_function_words=True):
    """
    Return the corpus from unpickle_cds() as a flat list of sentences.

    Every sentence is a list of strings in the format 'word-pos_tag',
    where the word has been lower-cased. Sentences from all files are
    concatenated in iteration order; the file keys themselves are not
    part of the result.

    collapse_function_words -- when true (the default), each POS tag is
    first passed through collapse_function_tags(), which is expected to
    replace all closed class / function word tags with the single tag
    'fn'. When false, the original tag is used unchanged.
    """
    corpus = unpickle_cds()
    all_sentences = []

    for file_key in corpus:
        for tagged_sentence in corpus[file_key]:
            rendered = []
            for word, tag in tagged_sentence:
                lowered = word.lower()
                # Only call the collapsing helper when it was asked for.
                label = collapse_function_tags(tag) if collapse_function_words else tag
                rendered.append(lowered + "-" + label)
            all_sentences.append(rendered)

    return all_sentences