def _load_labeled_texts(seg, folder, fileNames, label):
    """Segment each listed file in *folder* and return labelled samples.

    Every file is read, cut into words with ``seg.cut`` and reduced to the
    UTF-8 encoded words that are members of the module-level ``pornDict``.
    A progress line ``<label>Data<n>`` is printed per file.

    Returns a list of ``(words, label)`` tuples, one per file, in the order
    of *fileNames*.
    """
    samples = []
    for count, fileName in enumerate(fileNames, 1):
        print(label + "Data" + str(count))
        path = os.path.join(folder, fileName)
        # Close the handle deterministically instead of relying on GC.
        with open(path) as handle:
            data = seg.cut(handle.read())
        # Encode each word once, then keep only the vocabulary hits.
        encoded = (word.encode('utf-8') for word in data)
        text = [word for word in encoded if word in pornDict]
        samples.append((text, label))
    return samples


def get_data(posPath='/home/zhouxc/skindetector/AdultWebsiteText/',
             negPath='/home/zhouxc/skindetector/NormalWebsiteText/'):
    """Load the positive and negative text datasets from local folders.

    Positive and negative documents are stored in separate folders. Each
    document is segmented with the smallseg ``SEG`` tool (configured from the
    module-level ``dic``) and filtered down to the words found in the
    module-level ``pornDict``.

    Parameters:
        posPath: folder holding the documents labelled ``'Positive'``.
        negPath: folder holding the documents labelled ``'Negative'``.

    Returns:
        A 2-tuple ``(trainingData, trainingData)``. Both elements are the
        SAME list of ``(words, label)`` tuples, positives first.
        NOTE(review): presumably the second element is meant to be the test
        set; as written, callers evaluate on the training data -- confirm.

    Raises:
        OSError: if either folder cannot be listed (checked before any file
        is processed), or IOError if a file cannot be read.
    """
    # List both folders up front so a missing folder fails before any work.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    seg = SEG()
    seg.set(dic)

    trainingData = []

    print('---------------------Read Positive DataSet-----------------')
    trainingData.extend(_load_labeled_texts(seg, posPath, posFiles, 'Positive'))
    print('---------------------Positive DataSet done-----------------')

    print('---------------------Read Negative DataSet-----------------')
    trainingData.extend(_load_labeled_texts(seg, negPath, negFiles, 'Negative'))
    print('--------Negative DataSet done-----------------------------------')

    return trainingData, trainingData
def _load_labeled_texts(seg, folder, fileNames, label):
    """Segment each listed file in *folder* and return labelled samples.

    Every file is read, cut into words with ``seg.cut`` and reduced to the
    UTF-8 encoded words that are members of the module-level ``pornDict``.
    A progress line ``<label>Data<n>`` is printed per file.

    Returns a list of ``(words, label)`` tuples, one per file, in the order
    of *fileNames*.
    """
    samples = []
    for count, fileName in enumerate(fileNames, 1):
        print(label + "Data" + str(count))
        path = os.path.join(folder, fileName)
        # Close the handle deterministically instead of relying on GC.
        with open(path) as handle:
            data = seg.cut(handle.read())
        # Encode each word once, then keep only the vocabulary hits.
        encoded = (word.encode('utf-8') for word in data)
        text = [word for word in encoded if word in pornDict]
        samples.append((text, label))
    return samples


def get_data(posPath='/home/zhouxc/skindetector/AdultWebsiteText/',
             negPath='/home/zhouxc/skindetector/NormalWebsiteText/'):
    """Load the positive and negative text datasets from local folders.

    Positive and negative documents are stored in separate folders. Each
    document is segmented with the smallseg ``SEG`` tool (configured from the
    module-level ``dic``) and filtered down to the words found in the
    module-level ``pornDict``.

    Parameters:
        posPath: folder holding the documents labelled ``'Positive'``.
        negPath: folder holding the documents labelled ``'Negative'``.

    Returns:
        A 2-tuple ``(trainingData, trainingData)``. Both elements are the
        SAME list of ``(words, label)`` tuples, positives first.
        NOTE(review): presumably the second element is meant to be the test
        set; as written, callers evaluate on the training data -- confirm.

    Raises:
        OSError: if either folder cannot be listed (checked before any file
        is processed), or IOError if a file cannot be read.
    """
    # List both folders up front so a missing folder fails before any work.
    posFiles = os.listdir(posPath)
    negFiles = os.listdir(negPath)

    seg = SEG()
    seg.set(dic)

    trainingData = []

    print('---------------------Read Positive DataSet-----------------')
    trainingData.extend(_load_labeled_texts(seg, posPath, posFiles, 'Positive'))
    print('---------------------Positive DataSet done-----------------')

    print('---------------------Read Negative DataSet-----------------')
    trainingData.extend(_load_labeled_texts(seg, negPath, negFiles, 'Negative'))
    print('--------Negative DataSet done-----------------------------------')

    return trainingData, trainingData
+ str(i) \ + '''')" href="'''\ + url \ + '''" target="_blank"><font size="3">''' \ + arrowscript \ + title \ + '''</font></a><br /><font size="-1">''' \ + snippet \ + '''<br /><font color="#008000">''' \ + url \ + '''<br /></font></font></td></tr></table>\n''' pageStr += resultStr i += 1 return pageStr if __name__ == '__main__': #resultsList = ["我是中国人民的儿子", "你是我儿子", "中国人民万岁", "我永远是中国人民的儿子"] seg = SEG() #print 'Load dict...' words = "main.dic" seg.set(words) #print "Dict is OK." #print psudorerank(resultsList, 2) username = "******" engine = request.GET.get("engine", "") resultsTable = ResultInfoTable[engine] [query, pagecontent] = userFeedbackRerank(username, resultsTable, seg)
#encoding=utf-8
# Benchmark script: times smallseg's SEG.cut on the contents of text.txt.
# For i = 1..100 it segments the text i times in a row and prints the elapsed
# wall-clock time of that batch.
from time import time

# psyco is an optional accelerator; run without it when it is unavailable.
# `except Exception` keeps this best-effort but, unlike a bare `except:`, lets
# KeyboardInterrupt / SystemExit propagate.
try:
    import psyco
    psyco.full()
except Exception:
    pass

from smallseg import SEG

# Read the benchmark text and the dictionary, closing both files promptly.
with open("text.txt") as text_file:
    s3 = text_file.read()

with open("main.dic") as dict_file:
    words = [x.rstrip() for x in dict_file]

seg = SEG()
print('Load dict...')
seg.set(words)
print("Dict is OK.")

for i in xrange(1, 101):
    start = time()
    for j in xrange(0, i):
        seg.cut(s3)  # result is discarded; only the elapsed time matters
    cost = time() - start
    # Same text as the original `print i,"times, cost:",cost` statement.
    print("%s times, cost: %s" % (i, cost))
    print("********************************")