Example #1
def setup_module():
    global weights_nb
    global counts
    global class_counts
    global allkeys
    counts, class_counts, allkeys = getCountsAndKeys(TRAINKEY)
    weights_nb = learnNBWeights(counts, class_counts, allkeys)
Example #2
def setup_module():
    global weights_nb
    global counts
    global class_counts
    global allkeys
    counts, class_counts, allkeys = getCountsAndKeys(TRAINKEY)
    weights_nb = learnNBWeights(counts, class_counts, allkeys)
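Examples #1 and #2 are module-level fixtures: nose (and pytest) call setup_module() once before any test in the file runs, so every test can read the shared weights_nb. A hypothetical consumer of that fixture, relying only on the key format the later examples establish (class priors stored under (tag, OFFSET) must exponentiate and sum to 1):

def test_nb_priors_normalized():
    # exp of the OFFSET weights are the class priors P(tag); they must sum to 1
    total = sum(np.exp(w) for (tag, feat), w in weights_nb.items() if feat == OFFSET)
    assert_almost_equals(1.0, total, places=3)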
Example #3
def test_nb_simple():
    '''
    Tests for the following two sentences: 
    the D
    man N
    runs V

    man V
    the D
    cannons N
    '''
    allwords = ['the', 'man', 'runs', 'the', 'cannons']
    wordCountsByTag = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({'man': 1, 'cannons': 1}),
        'V': Counter({'runs': 1, 'man': 1})
    })
    classCounts = Counter({'D': 2, 'N': 2, 'V': 2})

    weights = naivebayes.learnNBWeights(wordCountsByTag, classCounts, allwords, alpha=0)
    assert_almost_equals(0.5, np.exp(weights[('N', 'man')]), places=3)
    assert_almost_equals(0.5, np.exp(weights[('V', 'man')]), places=3)
    assert_almost_equals(1.0, np.exp(weights[('D', 'the')]), places=3)

    # offsets
    assert_almost_equals(0.333, np.exp(weights[('N', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('V', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('D', OFFSET)]), places=3)
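Taken together, these assertions pin down the contract of learnNBWeights: (tag, word) keys hold log P(word | tag) with add-alpha smoothing, and (tag, OFFSET) keys hold the log class prior. A minimal sketch consistent with the tests, not necessarily the reference implementation (the OFFSET constant and the default alpha are assumptions):

from collections import defaultdict
import numpy as np

OFFSET = '**OFFSET**'  # stand-in; the tests import OFFSET from the course code

def learnNBWeights(counts, class_counts, allwords, alpha=0.1):  # default alpha assumed
    weights = defaultdict(float)
    vocab = set(allwords)  # dedupe the token list
    total = float(sum(class_counts.values()))
    for tag in class_counts:
        denom = class_counts[tag] + alpha * len(vocab)
        for word in vocab:
            # with alpha = 0, an unseen (tag, word) pair gets log(0) = -inf
            weights[(tag, word)] = np.log(counts[tag][word] + alpha) - np.log(denom)
        # offset feature: the log class prior log P(tag)
        weights[(tag, OFFSET)] = np.log(class_counts[tag]) - np.log(total)
    return weights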
Example #4
def test_nb_smoothing():
    '''
    Tests for the following two sentences, with smoothing of 0.5
    the D
    man N
    runs V

    man V
    the D
    cannons N
    '''
    allwords = ['the', 'man', 'runs', 'the', 'cannons']
    wordCountsByTag = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({'man': 1, 'cannons': 1}),
        'V': Counter({'runs': 1, 'man': 1})
    })
    classCounts = Counter({'D': 2, 'N': 2, 'V': 2})

    # smoothing of 0.5 reserves 1/2 probability mass for unknown
    weights = naivebayes.learnNBWeights(wordCountsByTag, classCounts,
                                        allwords, alpha=0.5)
    assert_almost_equals(5.0 / 8.0, np.exp(weights[('D', 'the')]), places=3)
    assert_almost_equals(1.0 / 8.0, np.exp(weights[('N', 'the')]), places=3)

    # offsets unchanged by smoothing
    assert_almost_equals(0.333, np.exp(weights[('N', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('V', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('D', OFFSET)]), places=3)
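Where 5/8 and 1/8 come from: with add-alpha smoothing, P(word | tag) = (count(word, tag) + alpha) / (count(tag) + alpha * |V|), and the deduplicated vocabulary here has |V| = 4. A quick hand-check of the two word assertions above:

alpha, V = 0.5, 4
print((2 + alpha) / (2 + alpha * V))  # P(the|D) = 2.5/4 = 0.625 = 5/8
print((0 + alpha) / (2 + alpha * V))  # P(the|N) = 0.5/4 = 0.125 = 1/8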
Example #5
def setup_module():
    global weights_nb
    global alltags
    global allwords
    allwords = set()
    counters = most_common.get_tags(TRAIN_FILE)
    for counts in counters.values():
        allwords.update(set(counts.keys()))
    class_counts = most_common.get_class_counts(counters)
    weights_nb = naivebayes.learnNBWeights(counters, class_counts, allwords)
    alltags = preproc.getAllTags(TRAIN_FILE)
Example #6
def setup_module():
    global weights_nb
    global alltags
    global allwords
    allwords = set()
    counters = most_common.get_tags(TRAIN_FILE)
    for counts in counters.values():
        allwords.update(set(counts.keys()))
    class_counts = most_common.get_class_counts(counters)
    weights_nb = naivebayes.learnNBWeights(counters, class_counts, allwords)
    alltags = preproc.getAllTags(TRAIN_FILE)
Example #7
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using the HMM transition model.

    Parameters:
    trainfile -- the name of the file to train weights on
    Returns:
    weights -- weights dict with log-prob transition and emission features
    """
    # compute naive bayes weights for the emission features
    counters = most_common.get_tags(trainfile)
    allwords = set()
    for counts in counters.values():
        allwords.update(set(counts.keys()))
    class_counts = most_common.get_class_counts(counters)
    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords,
                                           0.001)

    # count tag-to-tag transitions, marking sentence boundaries with
    # START_TAG and END_TAG
    trans_cnt = defaultdict(Counter)
    with open(trainfile) as instances:
        prev_tag = START_TAG
        for line in instances:
            if len(line.rstrip()) == 0:
                trans_cnt[prev_tag][END_TAG] += 1
                prev_tag = START_TAG
                continue

            parts = line.rstrip().split()
            if len(parts) > 1:
                cur_tag = parts[1]
            else:
                cur_tag = UNKNOWN

            trans_cnt[prev_tag][cur_tag] += 1
            prev_tag = cur_tag
        if prev_tag != START_TAG:
            trans_cnt[prev_tag][END_TAG] += 1

    # convert nb weights to hmm emission weights; unseen features fall
    # back to a large negative log-probability
    hmm_weights = defaultdict(lambda: -1000.)
    for key in nb_weights:
        tag = key[0]
        word = key[1]
        hmm_weights[(tag, word, EMIT)] = nb_weights[key]

    for prev_tag in trans_cnt:
        cnt = trans_cnt[prev_tag]
        total_pairs = sum(cnt.values())
        for cur_tag in cnt:
            hmm_weights[(cur_tag, prev_tag,
                         TRANS)] = np.log(cnt[cur_tag]) - np.log(total_pairs)

    return hmm_weights
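The parsing loop implies a CoNLL-style training file: one "word tag" pair per line, with blank lines separating sentences. A hypothetical end-to-end check on the two-sentence corpus the tests use (START_TAG, END_TAG, and TRANS are assumed to be the module's boundary and feature constants):

with open('tiny_train.txt', 'w') as f:
    f.write('the D\nman N\nruns V\n\nman V\nthe D\ncannons N\n')
hmm_weights = get_HMM_weights('tiny_train.txt')
# one of the two sentences starts with D, so q(D | START) = 1/2
print(np.exp(hmm_weights[('D', START_TAG, TRANS)]))  # -> 0.5
# N is followed once by V and once by a sentence end, so q(V | N) = 1/2
print(np.exp(hmm_weights[('V', 'N', TRANS)]))        # -> 0.5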
Example #8
def test_nb_one_class():
    allwords = ['football', 'spoon', 'dog']
    wordCountsByTag = Counter({
        'N': Counter({
            'football': 1,
            'spoon': 1,
            'dog': 1
        })
    })
    classCounts = Counter({'N': 3})
    weights = naivebayes.learnNBWeights(wordCountsByTag, classCounts, allwords, alpha=0)
    assert_almost_equals(0.333, np.exp(weights[('N', 'spoon')]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('N', 'football')]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('N', 'dog')]), places=3)
Example #9
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using the HMM transition model.

    Parameters:
    trainfile -- the name of the file to train weights on
    Returns:
    weights -- weights dict with log-prob transition and emission features
    """
    # compute naive bayes weights for the emission features
    counters = most_common.get_tags(trainfile)
    allwords = set()
    for counts in counters.values():
        allwords.update(set(counts.keys()))
    class_counts = most_common.get_class_counts(counters)
    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords, 0.001)

    # count tag-to-tag transitions, marking sentence boundaries with
    # START_TAG and END_TAG
    trans_cnt = defaultdict(Counter)
    with open(trainfile) as instances:
        prev_tag = START_TAG
        for line in instances:
            if len(line.rstrip()) == 0:
                trans_cnt[prev_tag][END_TAG] += 1
                prev_tag = START_TAG
                continue

            parts = line.rstrip().split()
            if len(parts) > 1:
                cur_tag = parts[1]
            else:
                cur_tag = UNKNOWN

            trans_cnt[prev_tag][cur_tag] += 1
            prev_tag = cur_tag
        if prev_tag != START_TAG:
            trans_cnt[prev_tag][END_TAG] += 1

    # convert nb weights to hmm emission weights; unseen features fall
    # back to a large negative log-probability
    hmm_weights = defaultdict(lambda: -1000.)
    for key in nb_weights:
        tag = key[0]
        word = key[1]
        hmm_weights[(tag, word, EMIT)] = nb_weights[key]

    for prev_tag in trans_cnt:
        cnt = trans_cnt[prev_tag]
        total_pairs = sum(cnt.values())
        for cur_tag in cnt:
            hmm_weights[(cur_tag, prev_tag, TRANS)] = np.log(cnt[cur_tag]) - np.log(total_pairs)

    return hmm_weights
Example #10
def test_nb_one_class():
    allwords = ['football', 'spoon', 'dog']
    wordCountsByTag = Counter(
        {'N': Counter({
            'football': 1,
            'spoon': 1,
            'dog': 1
        })})
    classCounts = Counter({'N': 3})
    weights = naivebayes.learnNBWeights(wordCountsByTag,
                                        classCounts,
                                        allwords,
                                        alpha=0)
    assert_almost_equals(0.333, np.exp(weights[('N', 'spoon')]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('N', 'football')]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('N', 'dog')]), places=3)
Example #11
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using the HMM transition model.

    Parameters:
    trainfile -- the name of the file to train weights on
    Returns:
    weights -- weights dict with log-prob transition and emission features

    Transitions come from tag n-grams, e.g.
    ngrams("I really like", 2) -> (I, really), (really, like);
    the stop transition is stored under (END_TAG, 'N', TRANS) with
    q(stop | N) = count(N, stop) / count(N).
    """
    # compute naive bayes weights
    counters = most_common.get_tags(trainfile)
    class_counts = most_common.get_class_counts(counters)
    allwords = set()
    for counts in counters.values():
        allwords.update(set(counts.keys()))

    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords, alpha=0.001)

    # convert nb weights to hmm weights
    hmm_weights = defaultdict(lambda: -1000.0)
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight

    unigram = preproc.getNgrams(trainfile)
    bigram = preproc.getNgrams(trainfile, 2)
    unigramCount = preproc.getAllCounts(unigram)
    bigramCount = preproc.getAllCounts(bigram)

    for (tag1, tag2) in bigramCount.keys():
        hmm_weights[(tag2, tag1, TRANS)] = np.log(1.0 * bigramCount.get((tag1, tag2), 0)) - np.log(
            unigramCount.get(tag1, 0)
        )

    return hmm_weights
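On the same two-sentence corpus, the bigram estimate agrees with the counting loop in the earlier variant: q(t2 | t1) = count(t1, t2) / count(t1), where count(t1) counts t1 as a left context. A hand-check with plain Counters (the '<s>'/'</s>' boundary markers are placeholders for whatever preproc.getNgrams emits):

from collections import Counter
seqs = [['<s>', 'D', 'N', 'V', '</s>'], ['<s>', 'V', 'D', 'N', '</s>']]
bigrams = Counter(pair for seq in seqs for pair in zip(seq, seq[1:]))
contexts = Counter(tag for seq in seqs for tag in seq[:-1])  # tags with a successor
print(bigrams[('N', '</s>')] / contexts['N'])  # q(stop | N) = 1/2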
Example #12
def test_nb_smoothing():
    '''
    Tests for the following two sentences, with smoothing of 0.5
    the D
    man N
    runs V

    man V
    the D
    cannons N
    '''
    allwords = ['the', 'man', 'runs', 'the', 'cannons']
    wordCountsByTag = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({
            'man': 1,
            'cannons': 1
        }),
        'V': Counter({
            'runs': 1,
            'man': 1
        })
    })
    classCounts = Counter({'D': 2, 'N': 2, 'V': 2})

    # smoothing of 0.5 reserves 1/2 probability mass for unknown
    weights = naivebayes.learnNBWeights(wordCountsByTag,
                                        classCounts,
                                        allwords,
                                        alpha=0.5)
    assert_almost_equals(5.0 / 8.0, np.exp(weights[('D', 'the')]), places=3)
    assert_almost_equals(1.0 / 8.0, np.exp(weights[('N', 'the')]), places=3)

    # offsets unchanged by smoothing
    assert_almost_equals(0.333, np.exp(weights[('N', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('V', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('D', OFFSET)]), places=3)
Example #13
def test_nb_simple():
    '''
    Tests for the following two sentences: 
    the D
    man N
    runs V

    man V
    the D
    cannons N
    '''
    allwords = ['the', 'man', 'runs', 'the', 'cannons']
    wordCountsByTag = Counter({
        'D': Counter({'the': 2}),
        'N': Counter({
            'man': 1,
            'cannons': 1
        }),
        'V': Counter({
            'runs': 1,
            'man': 1
        })
    })
    classCounts = Counter({'D': 2, 'N': 2, 'V': 2})

    weights = naivebayes.learnNBWeights(wordCountsByTag,
                                        classCounts,
                                        allwords,
                                        alpha=0)
    assert_almost_equals(0.5, np.exp(weights[('N', 'man')]), places=3)
    assert_almost_equals(0.5, np.exp(weights[('V', 'man')]), places=3)
    assert_almost_equals(1.0, np.exp(weights[('D', 'the')]), places=3)

    # offsets
    assert_almost_equals(0.333, np.exp(weights[('N', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('V', OFFSET)]), places=3)
    assert_almost_equals(0.333, np.exp(weights[('D', OFFSET)]), places=3)