from math import log

from utils import read_in_corpus


def generate_mutual_info_distributions(lang_dict):
    """ build a mutual-information distribution for every word in the
        corpus file `lang_dict` """
    mutual_info = compute_bigram_and_unigram_mutual_info(lang_dict)
    corp = read_in_corpus(lang_dict)
    distribution_to_word = {}
    distros = []
    for word in corp:
        distro = distribution_for_word(mutual_info, word)
        # assumed reconstruction of the loop tail: collect each
        # distribution and remember which word produced it
        distros.append(distro)
        distribution_to_word[tuple(distro)] = word
    return (distros, distribution_to_word)
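
# distribution_for_word is not defined in this file; a minimal sketch of
# the shape it plausibly has, assuming the "distribution" for a word is the
# mutual-information score of each adjacent letter pair (this definition is
# an assumption, not the original implementation):
def distribution_for_word(mutual_info, word):
    """ score each split point of `word` by the mutual information of the
        letter pair straddling it; pairs never seen in the corpus get 0.0 """
    return [mutual_info.get(word[idx] + word[idx + 1], 0.0)
            for idx in range(len(word) - 1)]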

def compute_bigram_and_unigram_mutual_info(filename):
    """ generate bigram and unigram counts for all letters in all words
        in an input corpus """
    # initial variables
    corpus = read_in_corpus(filename)
    # using ' ' as stop char, and '#' as start char
    unigrams = {
        ' ': 0,
        # '#': 0
    }
    unigram_count = 0
    bigrams = {}
    bigrams_count = 0

    # generate counts
    for word in corpus:
        # unigrams['#'] += 1
        # unigram_count += 1
        # start_pair = '#' + word[0]
        # if start_pair not in bigrams:
        #     bigrams[start_pair] = 0
        # bigrams[start_pair] += 1
        # bigrams_count += 1
        for idx in range(0, len(word)):
            letter = word[idx]
            if letter not in unigrams:
                unigrams[letter] = 0
            unigrams[letter] += 1
            unigram_count += 1
            # the stop char stands in for the letter after the word's end
            second_letter = word[idx + 1] if idx < (len(word) - 1) else ' '
            pair = letter + second_letter
            if pair not in bigrams:
                bigrams[pair] = 0
            bigrams[pair] += 1
            bigrams_count += 1
        unigrams[' '] += 1
        unigram_count += 1

    # normalize counts to freqs
    for key, value in unigrams.items():
        unigrams[key] = (value + 0.0) / unigram_count
    for key, value in bigrams.items():
        bigrams[key] = (value + 0.0) / bigrams_count

    # build mutual information for each letter pair:
    # pmi(ab) = log p(ab) - log p(a) - log p(b)
    mutual_information_dict = {}
    for key, value in bigrams.items():
        prob_a = unigrams[key[0]]
        prob_b = unigrams[key[1]]
        prob_ab = bigrams[key]
        mutual_information_dict[key] = log(prob_ab) - log(prob_a) - log(prob_b)
    return mutual_information_dict
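
# Worked example with made-up frequencies: if p('t') = 0.07, p('h') = 0.05
# and p('th') = 0.01, then the score for 'th' is
#   log(0.01) - log(0.07) - log(0.05) ~ 1.05
# Positive scores mean two letters co-occur more often than their
# independent frequencies predict, so letter pairs inside a morpheme should
# score higher than pairs straddling a compound boundary.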

def test_classifier(classifier, filename):
    """ compare classifier behaviour with a collection of known compounds """
    gold_standard_a = read_in_corpus(filename)
    # gold_standard_b = read_in_corpus('./data/German_compound_words.txt')
    gold_standard = gold_standard_a  # + gold_standard_b
    gold_count = 0
    gold_misses = []
    for word in gold_standard:
        if classifier.classify(compund_features(word)) == 'compound':
            gold_count += 1
        else:
            gold_misses.append(word)
    print "Classifier correctly categorized {0}/{1} known compounds".format(
        gold_count, len(gold_standard))
    print "The first 500 it missed were: {0}".format(', '.join(gold_misses[:500]))
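
# A sketch of how test_classifier is presumably driven, assuming an
# NLTK-style NaiveBayesClassifier and the compund_features() extractor
# defined elsewhere in this project (the file path and label strings below
# are assumptions):
#
#   from nltk import NaiveBayesClassifier
#   compounds, non_compounds = build_training_data('./data/english-cmu-reformat.txt')
#   labeled = ([(compund_features(w), 'compound') for w in compounds] +
#              [(compund_features(w), 'non-compound') for w in non_compounds])
#   classifier = NaiveBayesClassifier.train(labeled)
#   test_classifier(classifier, COMPOUND_GOLD_STANDARD)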

def build_training_data(filename):
    input_corpus = read_in_corpus(filename)

    # build dictionary
    lang_dict = {}
    for word in input_corpus:
        lang_dict[word] = True

    # evaluate whether or not every word in the dictionary is a compound
    suspected_words = []
    for word in input_corpus:
        word_with_associates = {"word": word, "sub_words": []}
        if len(word) < 4:
            continue
        # naive algorithm: try every split point and keep splits where
        # both halves are themselves dictionary words
        for idx in range(2, len(word) - 2):
            left_word = word[0:idx]
            right_word = word[idx:len(word)]
            if (left_word in lang_dict) and (right_word in lang_dict):
                word_with_associates['sub_words'].append((left_word, right_word))
        if len(word_with_associates['sub_words']):
            suspected_words.append(word_with_associates)

    found_words = {}
    for sus_word in suspected_words:
        found_words[sus_word["word"]] = True

    non_compounds = []
    compounds = []
    for word in input_corpus:
        if len(word) == 0:
            continue
        if word in found_words:
            compounds.append(word)
        else:
            non_compounds.append(word)

    # should use this as a filter against false positives/false negatives?
    # base_gold_check(lang_dict, found_words)
    return (compounds, non_compounds)
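
# Worked example of the naive split: for 'football' (length 8), idx runs
# over 2..5, and idx == 4 yields left_word 'foot' and right_word 'ball';
# if both are dictionary entries, 'football' is flagged as a suspected
# compound. The range bounds mean the left part is always at least two
# letters long and the right part at least three.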

def base_gold_check(eng_dict, found_words):
    count = 0
    gold_standard = read_in_corpus(COMPOUND_GOLD_STANDARD)
    not_found = []
    for word in gold_standard:
        # if the word was in the predicted set, count it
        if word in found_words:
            count += 1
        # if the word was not in the predicted set but was present in the
        # dictionary, we missed it
        elif word in eng_dict:
            not_found.append(word)

    compounds_known_to_be_in_dict = 0
    for word in gold_standard:
        if word in eng_dict:
            compounds_known_to_be_in_dict += 1

    print "The naive cutting technique picked up {0}/{1} known compounds that were present in the original dictionary".format(
        count, compounds_known_to_be_in_dict)
    print "It missed: {0}".format(', '.join(not_found))
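
# COMPOUND_GOLD_STANDARD above is assumed to be a module-level constant
# holding the path to a list of known compounds, along the lines of
# (the filename here is hypothetical):
#
#   COMPOUND_GOLD_STANDARD = './data/known_compound_words.txt'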

def build_training_data(filename):
    input_corpus = read_in_corpus(filename)

    # build dictionary
    lang_dict = {}
    for word in input_corpus:
        lang_dict[word] = True

    # evaluate whether or not every word in the dictionary is a compound
    suspected_words = []
    for word in input_corpus:
        word_with_associates = {"word": word, "sub_words": []}
        if len(word) < 4:
            continue
        # naive algorithm
        for idx in range(2, len(word) - 2):
            left_word = word[0:idx]
            right_word = word[idx:len(word)]
            if (left_word in lang_dict) and (right_word in lang_dict):
                word_with_associates['sub_words'].append(
                    (left_word, right_word))
        if len(word_with_associates['sub_words']):
            suspected_words.append(word_with_associates)

    found_words = {}
    for sus_word in suspected_words:
        # print "word: {0} subwords: {1}".format(sus_word["word"], sus_word["sub_words"])
        found_words[sus_word["word"]] = True

    non_compounds = []
    compounds = []
    for word in input_corpus:
        if word in found_words:
            compounds.append(word)
        else:
            non_compounds.append(word)

    base_gold_check(lang_dict, found_words)
    return (compounds, non_compounds)

import codecs

from utils import read_in_corpus

lines = read_in_corpus('./data/german-dict-2.txt')

output_words = []
for line in lines:
    # the headword is everything before the first space; it may itself be
    # a comma-separated list of variants
    words = line.split(' ')[0]
    maybe_more_words = words.split(',')
    for word in maybe_more_words:
        output_words.append(word.lower())

f = codecs.open("data/german-dict-final.txt", "w+", "utf-8")
f.write('\n'.join(output_words))
f.close()
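
# Example (assuming the source dictionary's shape): a line like
#   'Hund,Koeter der dog'
# has the headword token 'Hund,Koeter', which contributes 'hund' and
# 'koeter' to the output list.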

import re

from utils import read_in_corpus

lines = read_in_corpus('./data/english-cmudict.dx1.txt')

output_words = []
for line in lines:
    # keep only the leading word characters; the pronunciation that
    # follows on each line is discarded
    word = re.search(r'^\w*', line).group(0)
    if len(word) > 1:
        output_words.append(word.lower())

f = open("english-cmu-reformat.txt", "w+")
f.write('\n'.join(output_words))
f.close()
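
# Example (assuming the dx1 file's shape): a line like
#   'abate AH0 B EY1 T'
# yields the word 'abate'; single-letter entries such as 'a' are dropped
# by the len(word) > 1 guard.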