from math import log
from utils import read_in_corpus

def generate_mutual_info_distributions(lang_dict):
    # despite its name, `lang_dict` is a corpus file path: it is passed
    # straight through to read_in_corpus
    mutual_info = compute_bigram_and_unigram_mutual_info(lang_dict)
    corp = read_in_corpus(lang_dict)
    distribution_to_word = {}
    distros = []
    for word in corp:
        distro = distribution_for_word(mutual_info, word)
        distros.append(distro)
        # tuples are hashable, so a distribution can key back to its word
        distribution_to_word[tuple(distro)] = word
    return distros, distribution_to_word
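# `distribution_for_word` is not shown in this listing. The sketch below is a
# guess at its shape, assuming it maps a word to the PMI score of each
# adjacent letter pair (using ' ' as the end-of-word marker, matching
# compute_bigram_and_unigram_mutual_info below); treat it as illustrative
# rather than the project's actual implementation.
def distribution_for_word(mutual_info, word):
    scores = []
    for idx in range(len(word)):
        second_letter = word[idx + 1] if idx < len(word) - 1 else ' '
        # unseen pairs score 0.0 instead of raising KeyError
        scores.append(mutual_info.get(word[idx] + second_letter, 0.0))
    return scores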
def compute_bigram_and_unigram_mutual_info(filename):
    """
    generate bigram and unigram counts for all letters in all words in an input corpus
    """
    # initial variables
    corpus = read_in_corpus(filename)
    # using ' ' as the end-of-word stop char ('#' start-char handling below is disabled)
    unigrams = {
        ' ': 0,
        # '#': 0
    }
    unigram_count = 0
    bigrams = {}
    bigrams_count = 0

    # generate counts
    for word in corpus:
        # unigrams['#'] += 1
        # unigram_count += 1
        # start_pair = '#' + word[0]
        # if start_pair not in bigrams:
        #     bigrams[start_pair] = 0
        # bigrams[start_pair] += 1
        # bigrams_count += 1

        for idx in range(0, len(word)):
            letter = word[idx]
            if letter not in unigrams:
                unigrams[letter] = 0
            unigrams[letter] += 1
            unigram_count += 1
            # the last letter pairs with the ' ' stop char
            second_letter = word[idx + 1] if idx < (len(word) - 1) else ' '
            pair = letter + second_letter
            if pair not in bigrams:
                bigrams[pair] = 0
            bigrams[pair] += 1
            bigrams_count += 1

        unigrams[' '] += 1
        unigram_count += 1

    # normalize counts to relative frequencies
    for key, value in unigrams.items():
        unigrams[key] = float(value) / unigram_count
    for key, value in bigrams.items():
        bigrams[key] = float(value) / bigrams_count

    # pointwise mutual information for each observed letter pair: log p(ab) - log p(a) - log p(b)
    mutual_information_dict = {}
    for key, value in bigrams.items():
        prob_a = unigrams[key[0]]
        prob_b = unigrams[key[1]]
        prob_ab = bigrams[key]
        mutual_information_dict[key] = log(prob_ab) - log(prob_a) - log(prob_b)
    return mutual_information_dict
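# Quick sanity check of the PMI formula above, on made-up probabilities: a
# pair that co-occurs more often than independence predicts scores positive.
#
#     >>> from math import log
#     >>> round(log(0.01) - log(0.07) - log(0.05), 4)  # p(ab)=.01, p(a)=.07, p(b)=.05
#     1.0498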
def test_classifier(classifier, filename):
    """
    compare classifier behaviour against a collection of known compounds
    """
    gold_standard_a = read_in_corpus(filename)
    # gold_standard_b = read_in_corpus('./data/German_compound_words.txt')
    gold_standard = gold_standard_a# + gold_standard_b
    gold_count = 0
    gold_misses = []
    for word in gold_standard:
        if classifier.classify(compund_features(word)) == 'compound':
            gold_count += 1
        else:
            gold_misses.append(word)
    print "Classifier correct categorizedly {0}/{1} known compounds".format(gold_count, len(gold_standard))
    print "The first 500 it missed were {0}".format(', '.join(gold_misses[:500]))
def build_training_data(filename):
    input_corpus = read_in_corpus(filename)
    # build dictionary
    lang_dict = {}
    for word in input_corpus:
        lang_dict[word] = True
    # check whether each corpus word can be split into two dictionary words

    suspected_words = []
    for word in input_corpus:
        word_with_associates = {
            "word": word,
            "sub_words": []
        }
        if len(word) < 4:
            continue
        # naive algorithm: try every cut point, keeping at least two letters on each side
        for idx in range(2, len(word) - 2):
            left_word = word[0:idx]
            right_word = word[idx:len(word)]
            if (left_word in lang_dict) and (right_word in lang_dict):
                word_with_associates['sub_words'].append((left_word, right_word))
        if len(word_with_associates['sub_words']):
            suspected_words.append(word_with_associates)

    found_words = {}
    for sus_word in suspected_words:
        found_words[sus_word["word"]] = True

    non_compounds = []
    compounds = []
    for word in input_corpus:
        if len(word) == 0:
            continue
        if word in found_words:
            compounds.append(word)
        else:
            non_compounds.append(word)


    # should this be used as a filter against false positives/negatives?
    # base_gold_check(lang_dict, found_words)

    return (compounds, non_compounds)
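# Toy illustration of the naive split above, on a hypothetical four-word
# dictionary: only the cut point that yields two in-dictionary halves survives.
demo_dict = {'foot': True, 'ball': True, 'football': True, 'ran': True}
demo_word = 'football'
demo_splits = [(demo_word[:idx], demo_word[idx:])
               for idx in range(2, len(demo_word) - 2)
               if demo_word[:idx] in demo_dict and demo_word[idx:] in demo_dict]
print(demo_splits)  # [('foot', 'ball')]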
def base_gold_check(eng_dict, found_words):
    count = 0
    # COMPOUND_GOLD_STANDARD is a module-level path constant defined elsewhere in the project
    gold_standard = read_in_corpus(COMPOUND_GOLD_STANDARD)
    not_found = []
    for word in gold_standard:
        # if the word is in the predicted set, count it as a hit
        if word in found_words:
            count += 1
        else:
            # if the word is not in the predicted set but is in the dictionary, we missed it
            if word in eng_dict:
                not_found.append(word)

    compounds_known_to_be_in_dict = 0
    for word in gold_standard:
        if word in eng_dict:
            compounds_known_to_be_in_dict += 1
    print "The naive cutting technique picked up {0}/{1} known compounds that were present in the original diction".format(count, compounds_known_to_be_in_dict)
    print "It missed: {0}".format(', '.join(not_found))
import codecs
from utils import read_in_corpus

lines = read_in_corpus('./data/german-dict-2.txt')
output_words = []
for line in lines:
    # the first space-delimited field may contain several comma-separated words
    first_field = line.split(' ')[0]
    for word in first_field.split(','):
        output_words.append(word.lower())

with codecs.open("data/german-dict-final.txt", "w+", "utf-8") as f:
    f.write('\n'.join(output_words))
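
# What one hypothetical dictionary line (not taken from the real file) turns
# into under the loop above:
demo_line = 'Hund,Hunde (der) dog'
print([w.lower() for w in demo_line.split(' ')[0].split(',')])  # ['hund', 'hunde']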
import re
from utils import read_in_corpus

lines = read_in_corpus('./data/english-cmudict.dx1.txt')
output_words = []
for line in lines:
    # keep only the leading run of word characters: the headword, minus the
    # pronunciation and any variant markers that follow it
    word = re.match(r'\w*', line).group(0)
    if len(word) > 1:
        output_words.append(word.lower())

with open("english-cmu-reformat.txt", "w+") as f:
    f.write('\n'.join(output_words))
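
# The leading-word-character match on a hypothetical dx1-style line (the real
# file format is assumed, not verified):
demo_line = 'abandon AH0 B AE1 N D AH0 N'
print(re.match(r'\w*', demo_line).group(0).lower())  # 'abandon'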