Code Example #1
File: nbdatica.py Project: liangxh/idu
import os
import sys
import cPickle
from optparse import OptionParser

# project-local modules, inferred from usage in liangxh/idu; N_EMO (the number
# of emotion classes) is a constant defined elsewhere in the project
import datica
import progbar
from naivebayes import NaiveBayesClassifier


def main():
	optparser = OptionParser()

	optparser.add_option('-x', '--dname_x', action='store', type='str', dest='dname_x')
	optparser.add_option('-s', '--dname_xsup', action='store', type='str', dest='dname_xsup')
	optparser.add_option('-k', '--value_k', action='store', type='float', dest='value_k', default=1.)
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default=N_EMO)

	opts, args = optparser.parse_args()

	print >> sys.stderr, 'nbdatica: [info] loading data for training NaiveBayes ... ',
	train, valid, test = datica.load_data(opts.dname_x, opts.ydim, valid_rate = 0.)
	print >> sys.stderr, 'OK'

	print >> sys.stderr, 'nbdatica: [info] training NaiveBayes ... ',
	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k)
	print >> sys.stderr, 'OK'

	if not os.path.exists(opts.dname_xsup):
		os.mkdir(opts.dname_xsup)

	pbar = progbar.start(opts.ydim)
	for eid in range(opts.ydim):
		ifname = opts.dname_x + '%d.pkl'%(eid)
		seqs = cPickle.load(open(ifname, 'rb'))  # binary mode for pickles

		ofname = opts.dname_xsup + '%d.pkl'%(eid)
		proba = [classifier.classify(seq) for seq in seqs]

		cPickle.dump(proba, open(ofname, 'wb'))
		pbar.update(eid + 1)
	pbar.finish()
Code Example #2
File: test_naivebayes.py Project: beckgom/python-nlp
	def test_single_training_data(self):
		classifier = NaiveBayesClassifier()
		classifier.train((('A', 'a'),))

		self.failUnless(classifier.label('a') == 'A')
		distribution = classifier.label_distribution('a')
		self.failUnlessEqual(len(distribution), 1)
		self.failUnless('A' in distribution)
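		# the distribution is in log space: the only label has probability 1.0,
		# and log(1.0) == 0.0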
		self.failUnless(distribution['A'] == 0.0, distribution)
Code Example #3
File: contextprocessor.py Project: liangxh/idu
import sys
import cPickle

# project-local helpers, inferred from usage: init_folders presumably creates
# the listed directories; the classifier import mirrors the other examples
from naivebayes import NaiveBayesClassifier


def prepare_above_naivebayes(dname_dataset, idname, odname, n_emo, k=1, ratio=0.9):
	train_x = []
	train_y = []
	dlist = []

	dir_dataset = 'data/blogs/%s/'%(dname_dataset)

	idir = dir_dataset + '%s/'%(idname)
	odir = dir_dataset + '%s/'%(odname)

	init_folders([odir, ])

	print >> sys.stderr, 'contextprocessor: [info] loading data'
	for eid in range(n_emo):
		xlist = []

		ifname = idir + '%d.pkl'%(eid)
		contextu = cPickle.load(open(ifname, 'rb'))

		n_train = int(len(contextu) * ratio)

		for i, comms in enumerate(contextu):
			tokens = []
			for ts, emos in comms:
				tokens.extend(ts)

			xlist.append(tokens)

			if i < n_train:
				train_x.append(tokens)
				train_y.append(eid)

		dlist.append(xlist)

		print >> sys.stderr, '\t%s OK'%(ifname)

	print >> sys.stderr, 'contextprocessor: [info] training naive bayes classifier'
	classifier = NaiveBayesClassifier()
	classifier.train(train_x, train_y, k)
	
	print >> sys.stderr, 'contextprocessor: [info] exporting naive bayes result'
	for eid, xlist in enumerate(dlist):
		probs = []
		for tokens in xlist:
			probs.append(classifier.classify(tokens))
		
		ofname = odir + '%d.pkl'%(eid)
		cPickle.dump(probs, open(ofname, 'wb'))
		print >> sys.stderr, '\t%s OK'%(ofname)
Code Example #4
File: test_naivebayes.py Project: beckgom/python-nlp
	def test_single_class_mixed_training_data(self):
		classifier = NaiveBayesClassifier()
		classifier.train((('A', 'a'),('A', 'a'),('B', 'a')))

		self.failUnless(classifier.label('a') == 'A')
		distribution = classifier.label_distribution('a')
		self.failUnlessEqual(len(distribution), 2)
		self.failUnless('A' in distribution)

		correct_distribution = Counter()
		correct_distribution['A'] = (2.0 / 3.0)**3
		correct_distribution['B'] = (1.0 / 3.0)**3
		correct_distribution.normalize()
		correct_distribution.log()
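		# normalize() scales the counts to probabilities and log() maps them to
		# log space, matching label_distribution's convention (this Counter is
		# python-nlp's own class, not collections.Counter)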

		self.failUnlessAlmostEqual(distribution['A'], correct_distribution['A'])
		self.failUnlessAlmostEqual(distribution['B'], correct_distribution['B'])
Code Example #5
File: nbscript_dir.py Project: liangxh/idu
import sys
import cPickle
from optparse import OptionParser

# project-local modules, inferred from usage in liangxh/idu
import datica
import validatica
from naivebayes import NaiveBayesClassifier


def main():
	optparser = OptionParser()

	# necessary
	optparser.add_option('-p', '--prefix', action='store', type='str', dest='prefix')
	optparser.add_option('-x', '--dir_x', action='store', type='str', dest='dir_x')
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim')

	optparser.add_option('-k', '--value_k', action='store', type='float', dest='value_k', default=1.)

	# debug; n_samples is parsed as an int so it can be used as a sample count
	optparser.add_option('-n', '--n_samples', action='store', type='int', dest='n_samples', default=None)

	opts, args = optparser.parse_args()
	
	#################### Preparation of Input ##############
	print >> sys.stderr, 'nbscript_dir: [info] loading dataset ... ',
	
	n_emo = opts.ydim
	datalen = opts.n_samples
	dataset = datica.load_data(opts.dir_x, opts.ydim, datalen)

	print >> sys.stderr, 'Done'

	def merge_train_valid(dataset):
		train, valid, test = dataset
		tx, ty = train
		vx, vy = valid
		tx.extend(vx)
		ty.extend(vy)
		return (tx, ty), test

	dataset = merge_train_valid(dataset)
	train, test = dataset

	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k)
	preds = [classifier.classify(x) for x in test[0]]

	fname_test = 'data/dataset/test/%s_test.pkl'%(opts.prefix)
	fname_valid = 'data/dataset/test/%s'%(opts.prefix)

	cPickle.dump((test[1], preds), open(fname_test, 'wb'))
	validatica.report(test[1], preds, fname_valid)
Code Example #6
File: nbscript.py Project: liangxh/idu
from optparse import OptionParser

# project-local modules, inferred from usage in liangxh/idu; N_EMO (the number
# of emotion classes) is a constant defined elsewhere in the project
import datica
import validatica
from naivebayes import NaiveBayesClassifier


def main():
	optparser = OptionParser()

	# necessary
	optparser.add_option('-p', '--prefix', action='store', type='str', dest='prefix')
	optparser.add_option('-k', '--value_k', action='store', type='float', dest='value_k', default=1.)
	optparser.add_option('-u', '--unigram', action='store_true', dest='unigram', default=False)
	optparser.add_option('-d', '--deduplicate', action='store_true', dest='flag_deduplicate', default=False)

	# debug; n_samples is parsed as an int so it can be used as a sample count
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default=N_EMO)
	optparser.add_option('-n', '--n_samples', action='store', type='int', dest='n_samples', default=None)

	opts, args = optparser.parse_args()

	if opts.unigram:
		dataset = datica.load_unigram(opts.ydim, opts.n_samples)
	else:
		dataset = datica.load_token(opts.ydim, opts.n_samples)

	def merge_train_valid(dataset):
		train, valid, test = dataset
		tx, ty = train
		vx, vy = valid
		tx.extend(vx)
		ty.extend(vy)
		return (tx, ty), test

	dataset = merge_train_valid(dataset)
	train, test = dataset

	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k, opts.flag_deduplicate)
	
	# note: precision here is measured on the training set itself
	preds = [classifier.classify(x) for x in train[0]]
	prec = validatica.precision_at_n(train[1], preds)
	print prec
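
Note: Examples #1, #3, #5, and #6 (all from liangxh/idu) exercise the same
classifier interface: train(xs, ys, k) takes a list of token lists, a parallel
list of labels, and a smoothing constant k, and classify(tokens) returns a
per-class score. The sketch below is only an assumption about how such an
interface could look (multinomial Naive Bayes with add-k smoothing, scores
kept in log space); the class name is hypothetical and this is not the
project's actual implementation.

import math
from collections import defaultdict

class SketchNaiveBayesClassifier(object):
	def train(self, xs, ys, k=1.0):
		# xs: list of token lists; ys: parallel list of labels; k: add-k smoothing
		self.k = float(k)
		self.vocab = set()
		self.doc_count = defaultdict(int)     # documents seen per label
		self.token_total = defaultdict(int)   # tokens seen per label
		self.token_count = defaultdict(lambda: defaultdict(int))
		for tokens, y in zip(xs, ys):
			self.doc_count[y] += 1
			for t in tokens:
				self.vocab.add(t)
				self.token_count[y][t] += 1
				self.token_total[y] += 1
		self.n_docs = len(xs)

	def classify(self, tokens):
		# score(y) = log P(y) + sum over tokens of log P(t | y)
		scores = {}
		v = len(self.vocab)
		for y in self.doc_count:
			score = math.log(float(self.doc_count[y]) / self.n_docs)
			denom = self.token_total[y] + self.k * v
			for t in tokens:
				score += math.log((self.token_count[y].get(t, 0) + self.k) / denom)
			scores[y] = score
		return scores

# usage: the scores are log-probabilities up to a constant; exponentiate and
# normalize them if a proper posterior distribution is needed
clf = SketchNaiveBayesClassifier()
clf.train([['good', 'fun'], ['bad']], [1, 0], k=1.)
print clf.classify(['good'])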
Code Example #7
File: rulewizard.py Project: maripeza/grien-dour
    def find_best_rules(self):
        '''
        Uses this instance's train_list to build a temporary classifier and
        find the most informative features of the training set. The feature
        list is stored in self.best_feature_list and iterated with
        self.current_best_feature.
        '''
        train_set = [(word_features(word), outcome) for (word, outcome) in self.train_list]
        classifier = NaiveBayesClassifier.train(train_set)
        # NLTK's show_most_informative_features() only prints a table and
        # returns None; most_informative_features() is the call that returns the
        # (feature, value) pairs iterated here (NLTK assumed from .train() above)
        sorted_feature_list = list(classifier.most_informative_features(10000))
        # sort the features by their value, highest first
        sorted_feature_list.sort(key=lambda feature: feature[1], reverse=True)
        self.best_feature_list = [i[0] for i in sorted_feature_list]

        # intersect with each word in train_words to keep only their common letters
        for i in self.train_words:
            self.letter_set = set(i).intersection(self.letter_set)
Code Example #8
File: senti.py Project: g625731556/NLP
from naivebayes import NaiveBayesClassifier
import os
import codecs
from segmentor import Segmentor

def corpus_generator(segmentor):
    for corpus in map(lambda x: "sentiment_corpus/" + x, ["Ctrip_htl_ba_4000", "Dangdang_Book_4000", "Jingdong_NB_4000"]):
        classes = filter(lambda x: x[0] != ".", os.listdir(corpus))
        for cls in classes:
            print "Enumerating for '%s/%s' reviews." % (corpus, cls)
            cls_dir = os.path.join(corpus, cls)
            files = filter(lambda x: x.endswith(".txt"), os.listdir(cls_dir))
            for filename in files:
                with codecs.open(os.path.join(cls_dir, filename), "r", encoding="utf8") as file:
                    for line in file:
                        if not line.strip():
                            continue
                        words = segmentor(line.strip())
                        yield (cls, words)

segmentor = Segmentor()
generator = corpus_generator(segmentor)
classifier = NaiveBayesClassifier()
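# NOTE: train() consumes the (label, words) generator in a single pass; build a
# fresh generator if the classifier needs to be retrained.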
classifier.train(generator)

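# classify a Chinese snippet (roughly: persistently low raw-milk purchase
# prices are pushing dairy farmers into the red, forcing them to slaughter or
# sell their cows; farmers around Shuangyashan have jointly petitioned reporters)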
print classifier.classify(segmentor(u"这一地区生鲜奶收购价持续在低位徘徊,导致很多奶户入不敷出,被迫“砍牛”(杀牛或卖牛)。 近期,双鸭山市多地奶农联名向记者反映"))

# print classifier.classify("This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".split(" "))
# print classifier.classify("iqbvajkkjbarjta".split(" "))
# print classifier.classify("I don't recommend.".split(" "))
Code Example #9
File: movies.py Project: g625731556/NLP
from naivebayes import NaiveBayesClassifier
import os
import re


def review_generator(dir):
    classes = os.listdir(dir)
    for cls in classes:
        print "Enumerating for '%s' reviews." % cls
        cls_dir = os.path.join(dir, cls)
        files = filter(lambda x: x.endswith(".txt"), os.listdir(cls_dir))
        for filename in files:
            with open(os.path.join(cls_dir, filename), "r") as file:
                for line in file:
                    words = line.split()
                    words = filter(lambda x: re.match(r'^\w{3,}$', x), words)
                    yield (cls, words)


generator = review_generator("txt_sentoken")
classifier = NaiveBayesClassifier()
classifier.train(generator)

print classifier.classify("This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".split(" "))
print classifier.classify("iqbvajkkjbarjta".split(" "))
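# "".split(" ") yields [''], so the next call probes the classifier on a
# single token it has never seen in training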
print classifier.classify("".split(" "))