# Assumes these module-level imports elsewhere in the crawler:
#   import re
#   from bs4 import BeautifulSoup   (or the legacy BeautifulSoup package)
#   from AC import trieKmp
def _saveTaskResults(self, my_web):
    # Only keep pages whose URL looks like an article page (pages that contain body text).
    pattern = r'.*\w{16}\.((html)|(shtml))'
    url, pageSource = my_web.getDatas()
    r = re.search(pattern, url)
    if r is not None:
        soup = BeautifulSoup(pageSource)
        # Prefer the first <h2> as the title; fall back to the first <p>.
        if soup.h2 is not None:
            title = unicode(soup.h2.string)
        elif soup.p is not None:
            title = unicode(soup.p.string)
        else:
            title = 'no title'
        # Concatenate the text of every <p> tag as the article body.
        text = ''
        for i in soup.find_all('p'):
            text += unicode(i.get_text())
        # Match the keyword dictionary against the title and the body separately.
        t1 = trieKmp.gao(title)
        t2 = trieKmp.gao(text)
        # A keyword hit in the title outweighs body hits: mark it '9',
        # otherwise keep the body's occurrence digit.
        tmp = []
        for i in xrange(len(t1)):
            if t1[i] != '0':
                tmp.append('9')
            else:
                tmp.append(t2[i])
        res = ''.join(tmp)
        self.database.saveData(url, title, text[:40], res)
    return 0
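The weighting rule in _saveTaskResults is easiest to see in isolation: trieKmp.gao evidently returns one digit per dictionary keyword, and any keyword that appears in the title is promoted to '9', while keywords found only in the body keep their own occurrence digit. Below is a minimal, self-contained sketch of that idea; WORDS, gao_sketch and merge_hits are hypothetical stand-ins for illustration, not the project's AC/trieKmp code.

# -*- coding: utf-8 -*-
# Hypothetical stand-ins for illustration only (not the project's trieKmp).
WORDS = [u'foo', u'bar', u'baz']            # toy keyword dictionary

def gao_sketch(text):
    # One digit per dictionary word: occurrence count capped at 9.
    return ''.join(str(min(text.count(w), 9)) for w in WORDS)

def merge_hits(title, body):
    # Same rule as _saveTaskResults: a title hit becomes '9',
    # otherwise keep the body's occurrence digit.
    t1, t2 = gao_sketch(title), gao_sketch(body)
    return ''.join('9' if a != '0' else b for a, b in zip(t1, t2))

print merge_hits(u'foo news', u'bar bar baz')   # -> '921'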
import sys

from AC import trieKmp
from database import Database

# Python 2 hack so mixed str/unicode operations default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

data = Database('data.db')
h = data.find_a()      # all indexed pages loaded from the database
N = len(h)

while True:
    s = raw_input('input the keywords:')
    query = unicode(s)
    # One digit per dictionary word; '0' means the word is absent from the query.
    d = trieKmp.gao(query)
    fc = []
    for i in xrange(trieKmp.num):
        if d[i] == '0':
            continue
        fc.append(i)
    M = len(fc)
    print 'segmented keywords:',
    for ch in fc:
        print trieKmp.word[ch],
    print ''
    # Score every page by summing its stored digits for the query keywords.
    g = []
    for i in xrange(N):
        cnt = 0
        for j in fc:
            cnt += int(h[i][1][j])