Example #1
def main():
    # plain classifier: per-word frequencies and weighted probabilities
    cl = docclass.classifier(docclass.getwords)
    cl.setdb('test1.db')
    docclass.sampletrain(cl)
    print cl.fprob('quick', 'good')
    print cl.weighted_prob('money', 'good', cl.fprob)
    docclass.sampletrain(cl)
    print cl.weighted_prob('money', 'good', cl.fprob)

    # naive Bayes: whole-document probabilities plus thresholded classification
    clnb = docclass.naivebayes(docclass.getwords)
    clnb.setdb('test1.db')
    docclass.sampletrain(clnb)
    print clnb.prob('quick rabbit', 'good')
    print clnb.prob('quick rabbit', 'bad')
    print clnb.classify('quick rabbit', default='unknown')
    print clnb.classify('quick money', default='unknown')
    clnb.setthreshold('bad', 3.0)
    print clnb.classify('quick money', default='unknown')

    # Fisher method: per-feature category probabilities combined into fisherprob
    clfs = docclass.fisherclassifier(docclass.getwords)
    clfs.setdb('test1.db')
    docclass.sampletrain(clfs)
    print clfs.cprob('quick', 'good')
    print clfs.cprob('money', 'bad')
    print clfs.weighted_prob('money', 'bad', clfs.cprob)
    print clfs.fisherprob('quick rabbit', 'good')
    print clfs.fisherprob('quick rabbit', 'bad')
    print clfs.classify('quick rabbit')
    print clfs.classify('quick money')

    # train a second Fisher classifier from an RSS feed
    # (feedclassifier is a helper defined elsewhere in this project)
    clfs2 = docclass.fisherclassifier(docclass.getwords)
    clfs2.setdb('test1.db')
    feedclassifier('feed_sample2.rss', clfs2)
    print clfs2.cprob('Pandas', 'python')
    print clfs2.cprob('python', 'python')
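Nearly every snippet on this page leans on docclass.sampletrain, which seeds the classifier with five tiny documents. For reference, the version from Programming Collective Intelligence (which these examples follow) is:

def sampletrain(cl):
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')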
Example #2
    def testProb(self):
        cl = docclass.fisherclassifier(docclass.getwords)
        cl.setdb('test.db')
        docclass.sampletrain(cl)
        self.assertAlmostEquals(0.57142857, cl.cprob('quick', 'good'))
        self.assertAlmostEquals(0.78013987, cl.fisherprob('quick rabbit', 'good'))
        self.assertAlmostEquals(0.35633596, cl.fisherprob('quick rabbit', 'bad'))
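Where 0.57142857 comes from: cprob weighs a word's frequency in one category against its frequency across all categories. After one sampletrain pass, 'quick' appears in 2 of 3 'good' documents and 1 of 2 'bad' ones, so:

# cprob('quick', 'good') = fprob(good) / (fprob(good) + fprob(bad))
print (2 / 3.0) / (2 / 3.0 + 1 / 2.0)  # 4/7 = 0.571428571...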
Example #3
def train_fisher(train_path, test_path):
    cl = docclass.fisherclassifier(docclass.getwords)

    for filename in glob.glob(train_path):
        with open(filename, 'r') as f:
            text = f.read()
            # the fourth dot-separated filename field carries the label ('spam'/'ham')
            label = filename.split('.')[3]
            cl.train(text, label)

    print "Train Done!"

    TP = 0.0
    FN = 0.0
    FP = 0.0
    TN = 0.0

    for filename in glob.glob(test_path):
        with open(filename, 'r') as f:
            text = f.read()
            label = filename.split('.')[3]
            predict = cl.classify(text)
            if label == 'spam' and predict == 'spam':
                TP += 1
            elif label == 'spam' and predict == 'ham':
                FN += 1
            elif label == 'ham' and predict == 'spam':
                FP += 1
            elif label == 'ham' and predict == 'ham':
                TN += 1
            else:
                print predict, label

    print "Test Done!"

    score(TP, FN, FP, TN)
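The score helper is not included in the snippet; a minimal sketch of what it presumably computes from the confusion counts (the name and signature come from the call above, the body is an assumption):

def score(TP, FN, FP, TN):
    # derive the standard metrics from the confusion-matrix counts
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = 2 * precision * recall / (precision + recall)
    print "accuracy=%.4f precision=%.4f recall=%.4f f1=%.4f" % (
        accuracy, precision, recall, f1)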
Example #4
def test_fisher_weightedprob():
    sys.stderr.write("testing computation of fisher weightedprob...\n")
    reload(docclass)
    cl=docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(cl)
    wp = cl.weightedprob('money', 'bad', cl.cprob)
    sys.stdout.write("%f\n" %(wp)) # 0.75
Example #5
def main():
	# table = string.maketrans("","")
	# s = '从满脸痘痘到细腻皮肤的蜕变,大S及皮肤科医生都推荐的修复面膜,[ 围观]解决皮肤的多种问题~点击查看详情:http://t.cn/zHFnve4'
	# for x in getWords(s):
	# 	print x
	# s.translate(table, string.punctuation+extra_punctuation)
	# regxs = {r'\[\S+?\]': ''}
	# for key,value in regxs.items():
	# 	print key, value
	# with open("test.txt", "r") as f:
		# consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ')
	# print consumer_key,consumer_secret,key,secret,userid
	# run_crawler(consumer_key,consumer_secret,key,secret,'1986653865')
	# weibo = Sina_master(consumer_key,consumer_secret)
	# weibo.setToken(key, secret)
	# weibo.manage_access()
	# info = weibo.get_latest_weibo(count=5, user_id=userid)
	# reptile(sina_reptile,userid)
	# sina_reptile.connection.close()
	# for x in info:
	# 	print x
	# 	print x['geo']['city']
		# print x['text']
		# words =getWords(x['text'])
		# # print x['text']
	cl = docclass.fisherclassifier(docclass.getWords)
	cl.setdb('statuses.db')
	# print cl.cprob('幸福', 'test')
	# print cl.fisherprob('幸福', 'test')
	# cl.train(x, 'test;up;kill;volite')
	dic = cl.classifypercent(input_)  # input_ must be supplied elsewhere in this module
	print sorted(dic.items(), key=lambda e: e[1], reverse=True)
Example #6
def analyzeResults(settings, results):
    classifier = fisherclassifier(getwords)
    classifier.setdb(settings["database"])
    for result in results.keys():
        # get the actual category, description and title of the blog entry
        row = classifier.con.execute(
            "SELECT actualcategory,description,title FROM feeds WHERE guid=?",
            (results[result]["guid"],)).fetchone()
        actCategory = row[0]
        descrip = row[1]

        fProb = classifier.fisherprob(descrip, actCategory)

        predictedCategory = results[result]["category"]
        # normalize the title to ASCII and escape '&' for the LaTeX table row
        tit = unicodedata.normalize("NFKD", row[2]).encode("ascii", "ignore")
        tit = tit.replace("&", "\\&")
        print("{0} & {1} & {2} & {3} \\\\\\hline".format(tit, predictedCategory, actCategory, fProb))
Example #7
    def guess_the_prof(self):
        if self.method == 'naive':
            cl = docclass.naivebayes(docclass.getwords)
            prof_sel = self.box.get()  # the professor whose department we want to guess
            doc_of_prof = self.profs_data[prof_sel]
            self.trainer(prof_sel, cl)
            all_thresh = self.lb.get(0, END)
            thresholds = []
            for item in all_thresh:
                merged = item.split('-')
                threshnum = float(merged[0])
                thresh = merged[1]
                thresholds.append((thresh, threshnum))
            for thr, num in thresholds:
                cl.setthreshold(thr, num)
            self.pdep = cl.classify(doc_of_prof, default='unknown')
        else:
            cl = docclass.fisherclassifier(docclass.getwords)
            prof_sel = self.box.get()  # the professor whose department we want to guess
            doc_of_prof = self.profs_data[prof_sel]
            self.trainer(prof_sel, cl)
            all_thresh = self.lb.get(0, END)
            thresholds = []
            for item in all_thresh:
                merged = item.split('-')
                threshnum = float(merged[0])
                thresh = merged[1]
                thresholds.append((thresh, threshnum))
            for thr, num in thresholds:
                cl.setminimum(thr, num)
            self.pdep = cl.classify(doc_of_prof, default='unknown')
        self.verdict()
Example #8
def train3():
    import docclass
    cl2 = docclass.fisherclassifier(docclass.getwords)
    cl2.setdb('test3.db')
    # run the five-document sample training 2000 times to build up a large DB
    for a in range(2000):
        docclass.sampletrain(cl2)
    cl2.con.commit()
Example #9
def test_fisher_cprob():
    sys.stderr.write("testing computation of fisher cprob...\n")
    reload(docclass)
    cl=docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(cl)
    cp1 = cl.cprob('quick', 'good')
    sys.stdout.write("%f\n" %(cp1)) # 0.57142857...
    cp2 = cl.cprob('money', 'bad')
    sys.stdout.write("%f\n" %(cp2)) # 1.0
Example #10
def myFisherModelInTrainingAndTesting(
    trainingInputFileName, entriesXMLFileName, dbFileName, mode, maxItems, getWordGetEntryMethod="getWord"
):

    if (mode == "train" or mode == "test") and (
        getWordGetEntryMethod == "getWord" or getWordGetEntryMethod == "getEntry"
    ):

        if len(trainingInputFileName) > 0 and len(entriesXMLFileName) > 0 and len(dbFileName) > 0 and maxItems > 0:

            if getWordGetEntryMethod == "getWord":
                cl = docclass.fisherclassifier(docclass.getwords)
            else:
                cl = docclass.fisherclassifier(feedfilter.entryfeatures)

            cl.setdb(dbFileName)
            feedfilter.nonInteractiveRead(
                entriesXMLFileName, cl, trainingInputFileName, mode, maxItems, getWordGetEntryMethod
            )
Example #11
def test_fisher_fisherprob():
    sys.stderr.write("testing computation of fisher fisherprob...\n")
    reload(docclass)
    cl=docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(cl)
    # cprob
    cp = cl.cprob('quick', 'good')
    sys.stdout.write("%f\n" %(cp)) # 0.57142857...
    # fisher prob
    fp1 = cl.fisherprob('quick rabbit', 'good')
    sys.stdout.write("%f\n" %(fp1)) # 0.780139
    fp2 = cl.fisherprob('quick rabbit', 'bad')
    sys.stdout.write("%f\n" %(fp2)) # 0.356335
Example #12
    def testClassify(self):
        cl = docclass.fisherclassifier(docclass.getwords)
        cl.setdb('test.db')
        docclass.sampletrain(cl)

        self.assertEquals('good', cl.classify('quick rabbit', default='unknown'))
        self.assertEquals('bad', cl.classify('quick money', default='unknown'))

        cl.setminimum('bad', 0.8)
        self.assertEquals('good', cl.classify('quick money', default='unknown'))

        cl.setminimum('bad', 0.4)
        self.assertEquals('bad', cl.classify('quick money', default='unknown'))
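The minimum acts as a per-category cutoff: classify only returns a category whose fisherprob clears that category's minimum, so raising the 'bad' minimum to 0.8 hands 'quick money' to 'good', and relaxing it to 0.4 lets 'bad' win again. The book's classify makes this explicit:

def classify(self, item, default=None):
    # look for the best result that also exceeds its category's minimum
    best = default
    max = 0.0
    for c in self.categories():
        p = self.fisherprob(item, c)
        if p > self.getminimum(c) and p > max:
            best = c
            max = p
    return best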
Example #13
def myFisherModelInTrainingAndTesting(trainingInputFileName,
                                      entriesXMLFileName,
                                      dbFileName,
                                      mode,
                                      maxItems,
                                      getWordGetEntryMethod='getWord'):

    if ((mode == 'train' or mode == 'test')
            and (getWordGetEntryMethod == 'getWord'
                 or getWordGetEntryMethod == 'getEntry')):

        if (len(trainingInputFileName) > 0 and len(entriesXMLFileName) > 0
                and len(dbFileName) > 0 and maxItems > 0):

            if (getWordGetEntryMethod == 'getWord'):
                cl = docclass.fisherclassifier(docclass.getwords)
            else:
                cl = docclass.fisherclassifier(feedfilter.entryfeatures)

            cl.setdb(dbFileName)
            feedfilter.nonInteractiveRead(entriesXMLFileName, cl,
                                          trainingInputFileName, mode,
                                          maxItems, getWordGetEntryMethod)
Example #14
def train_classifier(settings, trainingData):
	counter = 0
	size = len(trainingData)
	database = FeedDatabase(settings['database'])

	# mark every training entry as classified
	for key in trainingData.keys():
		database.change_classified(key, classified=True)
	database.close_database()

	classifier = fisherclassifier(getwords)
	classifier.setdb(settings['database'])
	for key in trainingData.keys():
		counter += 1
		sys.stderr.write('...Training ({0}/{1})...\n'.format(counter, size))
		classifier.train(trainingData[key]['description'], trainingData[key]['category'])
	sys.stderr.write('...Finished Training Classifier\n')
Example #15
    def Fisher_prediction(self):
        cll = docclass.fisherclassifier(docclass.getwords)
        for category in self.list_of_department:  #Category=department
            for teacher in self.dictionary_of_department_and_professor[
                    category]:
                if teacher == self.combovar.get():
                    continue
                else:
                    cll.train(
                        self.dictionary_as_database[teacher], category
                    )  #self.dictionary_as_database[teacher]=professor's information
        for item in self.list_of_thresholds:
            department, score = item
            cll.setminimum(department, score)

        prediction = cll.classify(
            self.dictionary_as_database[self.combovar.get()], default=None)
        self.help_to_write(prediction)
Example #16
def test_fisher_classify():
    sys.stderr.write("testing fisher classification...\n")
    reload(docclass)
    cl=docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(cl)
    # classify
    c1 = cl.classify('quick rabbit')
    sys.stdout.write("%s\n" %(c1)) # 'good'
    c2 = cl.classify('quick money')
    sys.stdout.write("%s\n" %(c2)) # 'bad'
    # set minimum for 'bad'
    cl.setminimum('bad', 0.8)
    c3 = cl.classify('quick money')
    sys.stdout.write("%s\n" %(c3)) # 'good'
    # set minimum for 'good'
    cl.setminimum('good', 0.4)
    c4 = cl.classify('quick money')
    sys.stdout.write("%s\n" %(c4)) # 'good'
Example #17
def main():
	cl = docclass.fisherclassifier(docclass.getWords)
	cl.setdb('statuses.db')
	with open("test.txt", "r") as f:
		consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ')
	weibo = Sina_master(consumer_key,consumer_secret)
	weibo.setToken(key, secret)
	info = weibo.get_latest_weibo(count=50, user_id="1906168267")#1986653865 #1794530900

	with open(cache_file, 'w') as f:
		for x in info:
			p= cl.classifypercent(x['text'])
			f.write(json.dumps(p)+"\n")

	with open(cache_file, "rb") as f:
		dic= user_line(f.readlines())
	print sorted(dic.items(), key=lambda e:e[1], reverse=True)
	os.remove(cache_file)
Example #18
def classifyEntries(settings):
	database = FeedDatabase(settings['database'])
	unclassifiedEntries = database.get_unpredicted_entries()
	database.close_database()

	classifier = fisherclassifier(getwords)
	classifier.setdb(settings['database'])
	counter = 0
	size = len(unclassifiedEntries)
	results = []
	for entr in unclassifiedEntries:
		category = classifier.classify(entr['description'])
		results.append({'guid': entr['guid'], 'category': category})

		# checkpoint everything classified so far, so progress survives an interruption
		a = open('script50.txt', 'w+')
		for i in results:
			a.write('{0}|{1}\n'.format(i['guid'], i['category']))
		a.close()

		counter += 1
		sys.stderr.write('...Classified {0} of {1} entries\n'.format(counter, size))
Example #19
#!/usr/bin/python
import feedfilter
import docclass
import feedparser
import randomGram
import unidecode
#docclass.getwords("this")
myclassifier = docclass.fisherclassifier(docclass.getwords)
myclassifier.setdb('CSblogfeed.db')
#feedfilter.read('CSblogfeed.xml',myclassifier)

#pull in correct answers
with open('correct.dat', 'r') as f:
    correctf = f.read()
correct = correctf.split('\n')

#for line in correct:
#    print line
#del correct[-1]
#print len(correct)
#print type(correct)
#print correctf
#for item in correctf:
#    print item
f = feedparser.parse('CSblogfeed.xml')
count = 0
for entry in f['entries']:
    #train using 50
    if count != 90:
        #        print count," ",correct[count]," ",entry['title']
Example #20
def main():
  cl=docclass.fisherclassifier(docclass.getwords)
  cl.setdb('allsports3.db')
  read('allsports.xml',cl)
Example #21
import docclass as d

cl = d.fisherclassifier(d.getwords)
d.sampletrain(cl)
print cl.classify('quick rabbit')
print cl.classify('quick money')
cl.setminimum('bad', 0.8)
print cl.classify('quick money')
cl.setminimum('good', 0.4)
print cl.classify('quick money')

for i in range(10):
    d.sampletrain(cl)
print cl.classify('quick money')
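The extra training rounds matter: repeating sampletrain does not change the word ratios, but it raises each word's count, so weightedprob drifts from the assumed 0.5 toward the raw cprob and fisherprob('quick money', 'bad') grows until it can clear the 0.8 minimum, flipping the final classification back to 'bad'.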
Example #22
def read(feed, classifier):  # signature inferred from the call at the bottom of this snippet
    f = feedparser.parse(feed)
    for entry in f['entries']:
        print
        print '----'
        print 'Title:     ' + entry['title'].encode('utf-8')
        print 'Publisher: ' + entry['publisher'].encode('utf-8')
        print
        print entry['summary'].encode('utf-8')

        fulltext = '%s\n%s\n%s' % (entry['title'], entry['publisher'],
                                   entry['summary'])

        #print 'Guess: ' + str(classifier.classify(fulltext))

        #cl = raw_input('Enter category: ')
        #classifier.train(fulltext, cl)

        print 'Guess: ' + str(classifier.classify(entry))

        cl = raw_input('Enter category: ')
        classifier.train(entry, cl)


if __name__ == '__main__':
    import docclass

    #cl = docclass.fisherclassifier(docclass.getwords)
    cl = docclass.fisherclassifier(entryfeatures)
    cl.setdb('python_feed.db')
    read('python_search.xml', cl)
Example #23
def main():
  cl=docclass.fisherclassifier(docclass.getwords)
  cl.setdb('dpaladhi.db')
  read('my_data.xml',cl)
Example #24
			fisherclassifier.train(fulltext,temp)
		else:
			value1 = str(fisherclassifier.classify(fulltext))
			print(value1)
			actual.append(int(temp))
		print()
	return actual
def readVector(filename):
	lines=[]
	for line in open(filename):
		lines.append(line)
	colnames=lines[0].strip().split('\t')[1:]
	rownames=[]
	data=[]
	for line in lines[1:]:
		p=line.strip().split('\t')
		rownames.append(p[0])
		data.append([float(x) for x in p[1:]])
	return rownames,colnames,data
c2=docclass.fisherclassifier(docclass.getwords)
blognames,words,data=readVector('blogdata1.txt')
Yvalue = readfile("http://superchicken46.blogspot.com/feeds/posts/default?max-results=100&alt=rss", c2)
X_digits = np.array(data)
Y_digits = np.array(Yvalue)
clf = svm.SVC(kernel='linear', C=10)
clf.fit(X_digits, Y_digits)
scores = cross_validation.cross_val_score(clf, X_digits, Y_digits, cv=10)  # pre-0.18 sklearn API
print(scores.mean())
for i in scores:
	print("Value:", i)
Example #25
    # keep the original case here: the uppercase counter below relies on it
    summarywords = [s for s in splitter.split(entry['summary'])
                    if len(s) > 2 and len(s) < 20]

    # count uppercase words
    uc = 0
    for i in range(len(summarywords)):
        w = summarywords[i]
        f[w] = 1
        if w.isupper():
            uc += 1

        # use word pairs from the summary as features
        if i < len(summarywords)-1:
            twowords = ' '.join(summarywords[i:i+2])  # i:i+1 would yield a single word
            f[twowords] = 1

    # keep the creator's and publisher's names intact as a single feature
    f['Publisher:' + entry['publisher']] = 1

    # UPPERCASE is a 'virtual' word that flags an excess of uppercase content
    if float(uc) / len(summarywords) > 0.3:
        f['UPPERCASE'] = 1

    return f

if __name__ == '__main__':
    # cl = docclass.fisherclassifier(docclass.getwords)
    cl = docclass.fisherclassifier(entryfeatures)
    cl.setdb('python_feed.db')
    feedread('python_feed.xml', cl)
Example #26
def main():
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('bbokka.db')
    print "testing the program"
    feedfilter.read('test.xml', cl)
Example #27
    def testOneCategory(self):
        cl = docclass.fisherclassifier(docclass.getwords)
        cl.setdb('test.db')
        cl.train('hallo hallo', 'greeting')
        self.assertEquals('greeting', cl.classify('hallo world'))
Example #28
    for i in range(len(summarywords)):
        w = summarywords[i]
        f[w] = 1
        if w.isupper():
            uc += 1

        # use word pairs from the summary as features
        if i < len(summarywords) - 1:
            twowords = ' '.join(summarywords[i:i + 2])  # i:i+1 would yield a single word
            f[twowords] = 1

    # keep the creator's and publisher's names intact as a single feature
    f['Publisher:' + entry['publisher']] = 1
    # UPPERCASE is a 'virtual' word that flags an excess of uppercase content
    if float(uc) / len(summarywords) > 0.3:
        f['UPPERCASE'] = 1
    '''
    # print the current best guess for the category
    print 'Guess: ' + str(classifier.classify(entry))

    # ask the user for the correct category, then train on the answer
    c1 = raw_input('Enter category: ')
    classifier.train(entry, c1)
    '''

    return f


c1 = fisherclassifier(entryfeatures)
read('../data/python_search.xml', c1)
Example #29
    uc = 0
    for i in range(len(summarywords)):
        w = summarywords[i]
        features[w] = 1
        if w.isupper(): uc += 1

        # use word pairs from the summary as features
        if i < len(summarywords) - 1:
            twowords = ' '.join(summarywords[i:i + 2])  # i:i+1 would yield a single word
            features[twowords] = 1

    # keep the creator's and publisher's names intact as a single feature
    features['Publisher:' + entry['publisher']] = 1

    # UPPERCASE is a 'virtual' word that flags an excess of uppercase content
    if float(uc) / len(summarywords) > 0.3: features['UPPERCASE'] = 1

    return features


if __name__ == "__main__":  #只有在执行当前模块时才会运行此函数
    # 对博客文章进行分类和训练
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('python_feed.db')
    read('python_search.xml', cl)

    # 使用改进的特征提取函数对文章分类进行处理
    cl = docclass.fisherclassifier(entryfeatures)
    cl.setdb('python_feed.db')
    read('python_search.xml', cl)
Example #30
urls = (
	'/api/mining', 'Mining',
	'/api/pro', 'Resouce',
	'/api/traindata', 'Traindata',
	'/.*', 'Index',
)

with open("test.txt", "r") as f:
	consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ')
render = web.template.render('templates')

app = web.application(urls, globals())

db = web.database(dbn="sqlite", db=conf.db_name)
cl = docclass.fisherclassifier(docclass.getWords)
cl.setdb(conf.db_name)
weibo = Sina_master(consumer_key,consumer_secret)
weibo.setToken(key, secret)
res = open(conf.pro_path).read()

class Index:
	def GET(self):
		i = web.input(pageIndex=1, pageSize=5)
		# select id, text from statuses order by id limit ?, ?
		posts = db.query('select id, text from statuses where status=0 order by id limit $pageIndex , $pageSize', \
			vars={'pageIndex': (int(i.pageIndex)-1)*int(i.pageSize), 'pageSize': i.pageSize})
		count = db.select('statuses', what='count(*) total_num', where=' status=$status', vars={'status': 0})
		# print 'val:%d'%int(count.c)
		total_num = count[0].total_num   
		c = total_num/int(i.pageSize)
Example #31
def testingModel(dictionaryOfTitleAndClass):
	cl=docclass.fisherclassifier(docclass.getwords)
	cl.setdb('politics_feed.db') # Only if you implemented SQLite
	feedfilter.readNonInteractiveTesting(dictionaryOfTitleAndClass,'politics_search2.xml',cl)
Example #32
def main():
    cl=docclass.fisherclassifier(docclass.getwords) 
    cl.setdb('smajeti.db')
    print "testing the program"
    feedfilter.read('toiEntertainment.xml',cl)
Example #33
#!/usr/bin/python2.7
# _*_ coding: utf-8 _*_

import docclass as ori
import os

c1 = ori.fisherclassifier(ori.getwords)
c1.setdb('test1.db')
'''
def getrate(self):
    # tally correct vs. incorrect classifications recorded in the cc table
    right = sum(r[0] for r in self.con.execute('select count from cc where wr=1').fetchall())
    wrong = sum(r[0] for r in self.con.execute('select count from cc where wr=0').fetchall())
    rate = float(right) / (right + wrong)
    return rate
'''


def doctest(cl):
    right = 0.0
    wrong = 0.0
    base = os.getcwd()
    dirham = base + r'\data_set\hw1_data\test\ham'
    dirspam = base + r'\data_set\hw1_data\test\spam'
    list1 = ori.GetFileList(dirspam, [])
    list2 = ori.GetFileList(dirham, [])
    for item in list1:
        f = open(item)
        words = f.read()
        #words = textParser(f)