Example #1
def make_classifier_builder(args):
	if isinstance(args.classifier, basestring):
		algos = [args.classifier]
	else:
		algos = args.classifier
	
	for algo in algos:
		if algo not in classifier_choices:
			raise ValueError('classifier %s is not supported' % algo)
	
	classifier_train_args = []
	
	for algo in algos:
		classifier_train_kwargs = {}
		
		if algo == 'DecisionTree':
			classifier_train = DecisionTreeClassifier.train
			classifier_train_kwargs['binary'] = False
			classifier_train_kwargs['entropy_cutoff'] = args.entropy_cutoff
			classifier_train_kwargs['depth_cutoff'] = args.depth_cutoff
			classifier_train_kwargs['support_cutoff'] = args.support_cutoff
			classifier_train_kwargs['verbose'] = args.trace
		elif algo == 'NaiveBayes':
			classifier_train = NaiveBayesClassifier.train
		elif algo == 'Scikits':
			classifier_train = ScikitsClassifier.train
		else:
			if algo != 'Maxent':
				classifier_train_kwargs['algorithm'] = algo
				
				if algo == 'MEGAM':
					megam.config_megam()
			
			classifier_train = MaxentClassifier.train
			classifier_train_kwargs['max_iter'] = args.max_iter
			classifier_train_kwargs['min_ll'] = args.min_ll
			classifier_train_kwargs['min_lldelta'] = args.min_lldelta
			classifier_train_kwargs['trace'] = args.trace
		
		classifier_train_args.append((algo, classifier_train, classifier_train_kwargs))
	
	def trainf(train_feats):
		classifiers = []
		
		for algo, classifier_train, train_kwargs in classifier_train_args:
			if args.trace:
				print 'training %s classifier' % algo
			
			classifiers.append(classifier_train(train_feats, **train_kwargs))
		
		if len(classifiers) == 1:
			return classifiers[0]
		else:
			return AvgProbClassifier(classifiers)
	
	return trainf
	#return lambda(train_feats): classifier_train(train_feats, **classifier_train_kwargs)
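A minimal usage sketch (not from the original project): the builder takes an argparse-style namespace, and the hypothetical field values below simply mirror the attributes the function reads.

# Hypothetical usage -- the Namespace fields mirror what make_classifier_builder reads.
import argparse

args = argparse.Namespace(
    classifier=['NaiveBayes', 'Maxent'],  # names must appear in classifier_choices
    entropy_cutoff=0.05, depth_cutoff=100, support_cutoff=10,
    max_iter=10, min_ll=0, min_lldelta=0.1, trace=1)

trainf = make_classifier_builder(args)
#classifier = trainf(train_feats)  # train_feats: a list of (featureset, label) pairs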
Example #2
def maximum_entropy(train_set, test_set):
    print "--- nltk.classify.maximum_entropy ---"

    import nltk
    from nltk.classify import megam
    from nltk.classify.util import accuracy

    megam.config_megam()

    classifier = nltk.classify.MaxentClassifier.train(train_set, "megam")

    print "Overall accuracy:", accuracy(classifier, test_set)
    classifier.show_most_informative_features(10)
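For context, a sketch of the inputs this function expects: NLTK classifiers train on (featureset, label) pairs, where a featureset is a dict mapping feature names to values. The toy data below is purely illustrative.

# Illustrative toy data in the (featureset, label) format NLTK expects.
train_set = [
    ({'contains(buena)': True, 'contains(comida)': True}, 'pos'),
    ({'contains(mala)': True, 'contains(comida)': True}, 'neg'),
    ({'contains(buena)': True}, 'pos'),
    ({'contains(mala)': True}, 'neg'),
]
test_set = [({'contains(buena)': True, 'contains(comida)': True}, 'pos')]
#maximum_entropy(train_set, test_set)  # requires a working megam binary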
Example #3
 def __init__(self, train_sents):
     train_set = []
     for tagged_sent in train_sents:
         untagged_sent = nltk.tag.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             featureset = npchunk_features(untagged_sent, i, history)
             train_set.append( (featureset, tag) )
             history.append(tag)
     from nltk.classify import megam
     megam.config_megam(bin='/cs/fs/home/hxiao/code/megam_i686.opt')
     self.classifier = nltk.MaxentClassifier.train(
        train_set, algorithm='megam', trace=0)
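This __init__ follows the consecutive classifier-based chunk tagger from the NLTK book; its companion tag method in that pattern (shown here as a sketch, not part of this excerpt) replays the same feature extraction at tagging time:

 def tag(self, sentence):
     # Sketch of the companion method: rebuild the tag history
     # incrementally, exactly as was done during training.
     history = []
     for i, word in enumerate(sentence):
         featureset = npchunk_features(sentence, i, history)
         tag = self.classifier.classify(featureset)
         history.append(tag)
     return zip(sentence, history)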
Example #4
 def __init__(self, train_sents):
     train_set = []
     for tagged_sent in train_sents:
         untagged_sent = nltk.tag.untag(tagged_sent)
         history = []
         for i, (word, tag) in enumerate(tagged_sent):
             featureset = npchunk_features(untagged_sent, i, history)
             train_set.append((featureset, tag))
             history.append(tag)
     from nltk.classify import megam
     megam.config_megam(bin='/cs/fs/home/hxiao/code/megam_i686.opt')
     self.classifier = nltk.MaxentClassifier.train(train_set,
                                                   algorithm='megam',
                                                   trace=0)
Example #5
def main():
    megam.config_megam(megam_path)

    feature_extractor = BigramFeatureExtractor()
    training_documents = []
    training_documents.append((['muy', 'buena', 'comida', 'hola'], 'pos'))
    training_documents.append((['muy', 'mala', 'comida'], 'neg'))
    training_documents.append((['muy', 'mala', 'comida', 'hola'], 'neg'))
    training_documents.append((['buena', 'comida'], 'pos'))

    training_set = nltk.classify.util.apply_features(feature_extractor.extract,
                                                     training_documents)
    classifier = nltk.MaxentClassifier.train(training_set,
                                             algorithm='megam',
                                             explicit=False,
                                             bernoulli=True,
                                             model='binary')
    classifier.show_most_informative_features()
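BigramFeatureExtractor is defined elsewhere in that project; a plausible minimal stand-in (an assumption, not the original class) would emit one boolean feature per bigram:

import nltk

class BigramFeatureExtractor(object):
    """Hypothetical stand-in: one boolean feature per bigram of the document."""
    def extract(self, words):
        return dict(('bigram(%s,%s)' % (w1, w2), True)
                    for w1, w2 in nltk.bigrams(words))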
Example #6
def make_classifier_builder(args):
	if isinstance(args.classifier, basestring):
		algos = [args.classifier]
	else:
		algos = args.classifier
	
	for algo in algos:
		if algo not in classifier_choices:
			raise ValueError('classifier %s is not supported' % algo)
	
	classifier_train_args = []
	
	for algo in algos:
		classifier_train_kwargs = {}
		
		if algo == 'DecisionTree':
			classifier_train = DecisionTreeClassifier.train
			classifier_train_kwargs['binary'] = False
			classifier_train_kwargs['entropy_cutoff'] = args.entropy_cutoff
			classifier_train_kwargs['depth_cutoff'] = args.depth_cutoff
			classifier_train_kwargs['support_cutoff'] = args.support_cutoff
			classifier_train_kwargs['verbose'] = args.trace
		elif algo == 'NaiveBayes':
			classifier_train = NaiveBayesClassifier.train
		elif algo == 'Svm':
			classifier_train = SvmClassifier.train
		elif algo.startswith('sklearn.'):
			# TODO: support many options for building an estimator pipeline
			pipe = [('classifier', make_sklearn_classifier(algo, args))]
			
			if args.tfidf:
				if args.trace:
					print 'using tfidf transformer with norm %s' % args.penalty
				
				pipe.insert(0, ('tfidf', TfidfTransformer(norm=args.penalty)))
			
			sparse = pipe[-1][1].__class__.__name__ not in dense_classifiers
			
			if not sparse and args.trace:
				print 'using dense matrix'
			
			if args.value_type == 'bool' and not args.tfidf:
				dtype = bool
			elif args.value_type == 'int' and not args.tfidf:
				dtype = int
			else:
				dtype = float
			
			if args.trace:
				print 'using dtype %s' % dtype.__name__
			
			classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=dtype, sparse=sparse).train
		else:
			if algo != 'Maxent':
				classifier_train_kwargs['algorithm'] = algo
				
				if algo == 'MEGAM':
					megam.config_megam()
			
			classifier_train = MaxentClassifier.train
			classifier_train_kwargs['max_iter'] = args.max_iter
			classifier_train_kwargs['min_ll'] = args.min_ll
			classifier_train_kwargs['min_lldelta'] = args.min_lldelta
			classifier_train_kwargs['trace'] = args.trace
		
		classifier_train_args.append((algo, classifier_train, classifier_train_kwargs))
	
	def trainf(train_feats):
		classifiers = []
		
		for algo, classifier_train, train_kwargs in classifier_train_args:
			if args.trace:
				print 'training %s classifier' % algo
			
			classifiers.append(classifier_train(train_feats, **train_kwargs))
		
		if len(classifiers) == 1:
			return classifiers[0]
		else:
			return AvgProbClassifier(classifiers)
	
	return trainf
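AvgProbClassifier comes from the surrounding project (nltk-trainer); a rough sketch of an average-probability ensemble with the interface used here, assuming every member classifier supports prob_classify, might look like this:

from nltk.classify import ClassifierI
from nltk.probability import DictionaryProbDist

class AvgProbClassifier(ClassifierI):
    """Sketch: average label probabilities across several classifiers."""
    def __init__(self, classifiers):
        self._classifiers = classifiers
        self._labels = sorted(set(l for c in classifiers for l in c.labels()))

    def labels(self):
        return self._labels

    def prob_classify(self, feats):
        dists = [c.prob_classify(feats) for c in self._classifiers]
        avg = dict((label, sum(d.prob(label) for d in dists) / len(dists))
                   for label in self._labels)
        return DictionaryProbDist(avg)

    def classify(self, feats):
        return self.prob_classify(feats).max()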
Example #7
 def __init__(self, n_folds, corpus_size, fold_number, remove_stop_words,
              use_unigrams, use_unigrams_frequency, use_bigrams,
              use_all_bigrams, min_word_length, remove_duplicated_chars,
              process_negation, stem, transform_lower_case,
              remove_punctuation_marks, remove_accents, lemma, adjectives,
              allprepro, out_of_domain_test, proportion_of_positives):
     super(CrossValidatedMegamMaxEntClassifier, self).__init__(
         n_folds, corpus_size, fold_number, remove_stop_words, use_unigrams,
         use_unigrams_frequency, use_bigrams, use_all_bigrams,
         min_word_length, remove_duplicated_chars, process_negation, stem,
         transform_lower_case, remove_punctuation_marks, remove_accents,
         lemma, adjectives, allprepro, out_of_domain_test,
         proportion_of_positives)
     
     megam.config_megam(megam_path)
Example #8
import nltk
from nltk.classify import megam
megam.config_megam('/Users/arlogb/ext_sources/megam_0.92/')
# We will use the nltk NaiveBayesClassifier.
def tset(extractor, tok):
    """function wrapping the apply_feature function. Should pass a
    feature extracting function which returns a featureset - dict mapping features to
    feature values. Tok are tokens which extractor will be applied to."""
    trainset = nltk.classify.apply_features(extractor, tok)
    return trainset

def trainclassifier(trainset):
    return nltk.NaiveBayesClassifier.train(trainset)

def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method, to allow
    inspection of what the method is actually doing and how long it takes.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = nltk.FreqDist() 
    feature_freqdist = nltk.defaultdict(nltk.FreqDist) 
    feature_values = nltk.defaultdict(set) 
    fnames = set() 

    print 'There are ' + str(len(labeled_featuresets)) + ' labeled featuresets'
    # Count up how many times each feature value occurred, given
    # the label and feature name.
    print 'Counting feature value occurrences'
Example #9
        help="use rules that are applied after the maxent algorithm")
    parser.add_argument("--classifier",
                        help="classifying algorithm to use",
                        choices=["maxent", "svm", "dt", "rf"],
                        default="svm")
    args = parser.parse_args()

    inputdir = args.inputdirdevel if args.testdir == "devel" else args.inputdirtest

    train_file = "features/features_megan_train_" + str(args.version) + ".txt"
    test_file = "features/features_megan_" + args.testdir + "_" + str(
        args.version) + ".txt"

    if args.classifier == "maxent":
        # Train
        megam.config_megam("src/megam_i686.opt")
        train = megam.call_megam(["multiclass", train_file])
        with open("features/weights", "w") as f:
            f.write(train)

        # Prediction
        predictions_text = megam.call_megam(
            ["-predict", "features/weights", "multiclass", test_file])
        predictions = [
            y.split("\t")[0] for y in predictions_text.split("\n")
            if len(y) > 1
        ]

    else:  # sklearn classifiers
        # Read train files
        with open(train_file, 'r') as f:
Example #10
        self.old_stdout.flush()
        self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr

    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush()
        self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr


try:
    import nltk.classify.megam as megam
    import nltk.classify.maxent as maxent
    with RedirectStdStreams(stdout=sys.stderr):
        megam.config_megam()
    #megam.config_megam(bin="path/to/megam")
    use_megam = True
except:
    print >> sys.stderr, "ERROR: megam not found, configure it before using this"
    print >> sys.stderr, "using default nltk  maxent"
    use_megam = False


class MaxentLearner:
    """
    Wrapper around the NLTK maximum-entropy classifier.
    TODO: currently forced to use megam; could take a parameter to allow
    other algorithms.

    return_type: whether to return the "label" (a string) or an Orange "value"
    """
Example #12
def make_classifier_builder(args):
    if isinstance(args.classifier, basestring):
        algos = [args.classifier]
    else:
        algos = args.classifier

    for algo in algos:
        if algo not in classifier_choices:
            raise ValueError("classifier %s is not supported" % algo)

    classifier_train_args = []

    for algo in algos:
        classifier_train_kwargs = {}

        if algo == "DecisionTree":
            classifier_train = DecisionTreeClassifier.train
            classifier_train_kwargs["binary"] = False
            classifier_train_kwargs["entropy_cutoff"] = args.entropy_cutoff
            classifier_train_kwargs["depth_cutoff"] = args.depth_cutoff
            classifier_train_kwargs["support_cutoff"] = args.support_cutoff
            classifier_train_kwargs["verbose"] = args.trace
        elif algo == "NaiveBayes":
            classifier_train = NaiveBayesClassifier.train
        elif algo == "Svm":
            classifier_train = SvmClassifier.train
        elif algo.startswith("sklearn."):
            # TODO: support many options for building an estimator pipeline
            pipe = [("classifier", make_sklearn_classifier(algo, args))]
            tfidf = getattr(args, "tfidf", None)
            penalty = getattr(args, "penalty", None)

            if tfidf and penalty:
                if args.trace:
                    print("using tfidf transformer with norm %s" % penalty)

                pipe.insert(0, ("tfidf", TfidfTransformer(norm=penalty)))

            sparse = pipe[-1][1].__class__.__name__ not in dense_classifiers

            if not sparse and args.trace:
                print("using dense matrix")

            value_type = getattr(args, "value_type", "bool")

            if value_type == "bool" and not tfidf:
                dtype = bool
            elif value_type == "int" and not tfidf:
                dtype = int
            else:
                dtype = float

            if args.trace:
                print("using dtype %s" % dtype.__name__)

            classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=dtype, sparse=sparse).train
        else:
            if algo != "Maxent":
                classifier_train_kwargs["algorithm"] = algo

                if algo == "MEGAM":
                    megam.config_megam()

            classifier_train = MaxentClassifier.train
            classifier_train_kwargs["max_iter"] = args.max_iter
            classifier_train_kwargs["min_ll"] = args.min_ll
            classifier_train_kwargs["min_lldelta"] = args.min_lldelta
            classifier_train_kwargs["trace"] = args.trace

        classifier_train_args.append((algo, classifier_train, classifier_train_kwargs))

    def trainf(train_feats):
        classifiers = []

        for algo, classifier_train, train_kwargs in classifier_train_args:
            if args.trace:
                print("training %s classifier" % algo)

            classifiers.append(classifier_train(train_feats, **train_kwargs))

        if len(classifiers) == 1:
            return classifiers[0]
        else:
            return AvgProbClassifier(classifiers)

    return trainf
Example #13
import nltk
from nltk.classify import megam

megam.config_megam("/Users/arlogb/ext_sources/megam_0.92/")
# We will use the nltk NaiveBayesClassifier.
def tset(extractor, tok):
    """function wrapping the apply_feature function. Should pass a
    feature extracting function which returns a featureset - dict mapping features to
    feature values. Tok are tokens which extractor will be applied to."""
    trainset = nltk.classify.apply_features(extractor, tok)
    return trainset


def trainclassifier(trainset):
    return nltk.NaiveBayesClassifier.train(trainset)


def NBCtrain(labeled_featuresets, estimator=nltk.ELEProbDist):
    """A copy of the nltk.NaiveBayesClassifier.train(...) method, to allow
    inspection of what the method is actually doing and how long it takes.

    @param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples C{(featureset, label)}.
    """
    label_freqdist = nltk.FreqDist()
    feature_freqdist = nltk.defaultdict(nltk.FreqDist)
    feature_values = nltk.defaultdict(set)
    fnames = set()

    print "There are " + str(len(labeled_featuresets)) + " labeled featuresets"
Example #14
File: app.py  Project: gennad/thesis
    import ipdb

    ipdb.set_trace()
    for clf in clfs:
        evaluate_cross_validataion(clf, data, target, 5)
    import ipdb

    ipdb.set_trace()

    if not os.path.exists(MULTI_CLASSIFIER_NAME):
        st = str("/usr/local/bin/megam")
        import ipdb

        ipdb.set_trace()
        config_megam(st)

        rwords = reuters_high_info_words()
        featdet = lambda words: bag_of_words_in_set(words, rwords)
        multi_train_feats, multi_test_feats = reuters_train_test_feats(featdet)

        trainf = lambda train_feats: MaxentClassifier.train(train_feats, algorithm="megam", trace=0, max_iter=10)
        # labelset = set(reuters.categories())
        labelset = set(list(publics.keys()))
        classifiers = train_binary_classifiers(trainf, multi_train_feats, labelset)
        len(classifiers)

        multi_classifier = MultiBinaryClassifier(*classifiers.items())

        output = open(MULTI_CLASSIFIER_NAME, "wb")
        pickle.dump(multi_classifier, output)
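For completeness, reloading the pickled multi-label classifier would follow the standard pattern (a sketch; MULTI_CLASSIFIER_NAME is the path used above):

import pickle

with open(MULTI_CLASSIFIER_NAME, "rb") as f:
    multi_classifier = pickle.load(f)
# multi_classifier.classify(featureset) should yield the set of matching labels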
Example #15
##########################
from nltk.data import path as nltk_data_path
nltk_data_location = os.getenv('NLTK_DATA_PATH')
if nltk_data_location is not None:
    nltk_data_path.append(nltk_data_location)

# EXTERNAL LIBRARIES
##########################
# NOTE Set this directory to wherever megam, MITIE, Stanford NER and SENNA are
# located.
EXTLIB_DIR = '/Data/nlp/utilities/'

#TODO this might need a try/except as well:
MEGAM_DIR = EXTLIB_DIR + 'megam_0.92/'
try:
    megam.config_megam(MEGAM_DIR + 'megam.opt')
except:
    print("megam is not installed or not configured correctly.")

MITIE_DIR = EXTLIB_DIR + 'MITIE/'
MITIE_LIB_DIR = MITIE_DIR + 'mitielib/'
sys.path.append(MITIE_LIB_DIR)

try:
    import mitie
except:
    print("To use the pre-trained MITIE model, you will need to install the " +
          "MITIE Python wrapper.")
try:
    stanford_ner_path = os.environ.get('CLASSPATH')
except:
Example #16
import optparse
import sys
import bleu_smooth
from nltk.classify.megam import call_megam, parse_megam_weights, config_megam
optparser = optparse.OptionParser()
optparser.add_option("-k", "--kbest-list", dest="train", default="data/train.100best", help="100-best translation lists")
optparser.add_option("-d", "--dev_kbest-list", dest="dev", default="data/dev+test.100best", help="100-best translation lists")
optparser.add_option("-r", "--reference", dest="reference", default="data/train.ref", help="Target language reference sentences")
(opts, _) = optparser.parse_args()
lm = tm1 = -0.92
tm2 = -1
megam_features = []
samples = []
sent_features = []
sign = lambda x: (1, -1)[x<0]
config_megam("/usr/local/bin/")
#Read Reference Translation for Training
ref = [line.strip().split() for line in open(opts.reference)]
#Read Candidate Translations for Training
all_hyps = [pair.split(' ||| ') for pair in open(opts.train)]
num_sents = len(all_hyps) / 100
bleu_score_per_sent = []
for s in xrange(0, num_sents):
  del bleu_score_per_sent[:]
  del sent_features[:]
  del samples[:]
  empty=0
  hyps_for_one_sent = all_hyps[s * 100:s * 100 + 100]
  #compute BLEU+1 for label and read/compute feature values
  for (num, hyp, feats) in hyps_for_one_sent:
      untranslated=0