Code example #1
def features_brown_news(path):
    """Build (feature dict, POS tag) pairs for the Brown 'news' category and dump them."""
    name = 'features_brown_news'
    tagged_words = brown.tagged_words(categories='news')
    common_suffixes = get_common_suffixes()
    # One feature set per tagged word: suffix features paired with the gold POS tag.
    featuresets = [(pos_features(common_suffixes, n), g) for (n, g) in tagged_words]
    log("featuresets")
    save_features(featuresets, path, name)
    log("dump featuresets")
    return name
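The listing above depends on log, save_features, and load_features from intfc_common_io, which is not shown. Below is a minimal sketch of what those helpers might look like, assuming they are plain pickle wrappers plus a timestamped print; this is a hypothetical reconstruction, not the author's actual module.

# Hypothetical sketch of the intfc_common_io helpers used above; the real
# module is not shown, so file layout and behavior are assumptions.
import os
import pickle
import time

def log(msg):
    # Timestamped progress message.
    print('%s  %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))

def save_features(featuresets, path, name):
    # Pickle the feature sets to <path>/<name>.pkl.
    with open(os.path.join(path, name + '.pkl'), 'wb') as f:
        pickle.dump(featuresets, f)

def load_features(path, name):
    # Load a previously dumped list of feature sets.
    with open(os.path.join(path, name + '.pkl'), 'rb') as f:
        return pickle.load(f)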
Code example #2
def get_common_suffixes():
    """Return the 100 most frequent 1-, 2-, and 3-character word suffixes in the Brown corpus."""
    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the last one, two, and three characters of every word.
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1

    # most_common() replaces the old FreqDist.inc()/keys()[:100] idiom removed in NLTK 3.
    common_suffixes = [suffix for suffix, count in suffix_fdist.most_common(100)]
    log("common_suffixes")
    return common_suffixes
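To see what this kind of counting produces, here is the same idea applied to a tiny hand-made word list; a standalone demo, not part of the original file.

# Standalone demo of the suffix-counting idea on a tiny word list.
import nltk

sample = ['running', 'jumped', 'quickly', 'cats', 'walked', 'walking']
fdist = nltk.FreqDist()
for word in sample:
    word = word.lower()
    fdist[word[-1:]] += 1
    fdist[word[-2:]] += 1
    fdist[word[-3:]] += 1

# The most frequent suffixes, with their counts.
print(fdist.most_common(5))
# e.g. [('g', 2), ('ng', 2), ('ing', 2), ('d', 2), ('ed', 2)]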
Code example #3
def train_naive_bayes(path, fname):
    name = 'naive_bayes' + '_._' + fname
    featuresets = load_features(path, fname)
    log("load featuresets")
    # Train on the first 200 feature sets only (a small, demo-sized training set).
    train_set = featuresets[:200]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    log("train classifier")
    save_classifiers(classifier, path, name)
    log("dump classifier")
Code example #4
def train_decision_tree(path, fname):
    name = 'decision_tree' + '_._' + fname
    featuresets = load_features(path, fname)
    log("load featuresets")
    # Train on the first 200 feature sets only.
    train_set = featuresets[:200]
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    log("train classifier")
    save_classifiers(classifier, path, name)
    log("dump classifier")
Code example #6
def pos_features(common_suffixes, word):
    """Return one boolean 'endswith(...)' feature per common suffix for the given word."""
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    log("pos_features")
    return features
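A quick check of the feature dictionary this returns, run in the same module or session as pos_features above; the suffix list here is hard-coded just for the demo.

# Demo call: one boolean feature per suffix in the list.
demo_suffixes = ['e', 's', 'ing', 'ed', 'ly']   # hard-coded for the demo
print(pos_features(demo_suffixes, 'Running'))
# -> {'endswith(e)': False, 'endswith(s)': False, 'endswith(ing)': True,
#     'endswith(ed)': False, 'endswith(ly)': False}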
Code example #7
#                                                #
# @file:    intfc_nltk_features.py
# @author:    [email protected]
# @brief:   collect features and dump
#                                                #
 
import sys, os, time, pickle
import nltk
from intfc_common_io import log, home, save_features, load_features

from nltk.corpus import brown
log("import nltk.corpus")


##
#    
#    @brief:    find the most common word suffixes in the Brown corpus
##

def get_common_suffixes():
    """Return the 100 most frequent 1-, 2-, and 3-character word suffixes in the Brown corpus."""
    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the last one, two, and three characters of every word.
        suffix_fdist[word[-1:]] += 1
        suffix_fdist[word[-2:]] += 1
        suffix_fdist[word[-3:]] += 1

    # most_common() replaces the old FreqDist.inc()/keys()[:100] idiom removed in NLTK 3.
    common_suffixes = [suffix for suffix, count in suffix_fdist.most_common(100)]
    log("common_suffixes")
    return common_suffixes
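A possible driver tying the steps together: extract and dump the feature sets once, then train both classifiers from the dump. The module name for the training functions, the dump path, and the __main__ block are assumptions, not part of the original files.

# Hypothetical driver for the pipeline above; module layout and path are assumed.
from intfc_nltk_features import features_brown_news
from intfc_nltk_classifiers import train_naive_bayes, train_decision_tree  # assumed module name

if __name__ == '__main__':
    path = './dumps'                      # assumed dump directory; must already exist
    fname = features_brown_news(path)     # build and pickle the feature sets
    train_naive_bayes(path, fname)        # train and pickle a Naive Bayes model
    train_decision_tree(path, fname)      # train and pickle a decision tree model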