コード例 #1
0
ファイル: crawler.py プロジェクト: liangsheng/crawler_my
 def _saveTaskResults(self, my_web):
     #只过滤包含正文的网页
     str = '.*\w{16}\.((html)|(shtml))'
     url, pageSource = my_web.getDatas()
     r = re.search(str, url)
     if r is not None:        
        soup = BeautifulSoup(pageSource)
        if soup.h2 is not None:
            title = unicode(soup.h2.string)
        elif soup.p is not None:
            title = unicode(soup.p.string)
        else:
            title = 'no title'
        text = ''
        for i in soup.find_all('p'):
           text += unicode(i.get_text())           
        #tmp = trieKmp.gao(title + text)
        t1 = trieKmp.gao(title)
        t2 = trieKmp.gao(text)
        tmp = []
        for i in xrange(len(t1)):
            if t1[i] != '0':
                tmp.append('9')
            else:
                tmp.append(t2[i])
        res = ''.join(tmp)
        #print 'res=', res          
       # print 'text=', text, 'tmp=', tmp
       # print 'tmp=', tmp
        self.database.saveData(url, title, text[: 40], res)
     return 0
コード例 #2
0
ファイル: crawler.py プロジェクト: liangsheng/crawler_my
 def _saveTaskResults(self, my_web):
     #只过滤包含正文的网页
     str = '.*\w{16}\.((html)|(shtml))'
     url, pageSource = my_web.getDatas()
     r = re.search(str, url)
     if r is not None:
         soup = BeautifulSoup(pageSource)
         if soup.h2 is not None:
             title = unicode(soup.h2.string)
         elif soup.p is not None:
             title = unicode(soup.p.string)
         else:
             title = 'no title'
         text = ''
         for i in soup.find_all('p'):
             text += unicode(i.get_text())
         #tmp = trieKmp.gao(title + text)
         t1 = trieKmp.gao(title)
         t2 = trieKmp.gao(text)
         tmp = []
         for i in xrange(len(t1)):
             if t1[i] != '0':
                 tmp.append('9')
             else:
                 tmp.append(t2[i])
         res = ''.join(tmp)
         #print 'res=', res
         # print 'text=', text, 'tmp=', tmp
         # print 'tmp=', tmp
         self.database.saveData(url, title, text[:40], res)
     return 0
コード例 #3
0
ファイル: ask.py プロジェクト: liangsheng/crawler_my
from AC import trieKmp
from database import Database

# Python 2 idiom: reloading sys makes setdefaultencoding available again so
# that implicit str/unicode conversions below use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Open the database and load the result of find_a() once, before the prompt
# loop.  NOTE(review): h[i][1] is indexed per keyword further down, so each
# entry presumably carries a per-keyword flag string -- confirm in Database.
data = Database('data.db')
h = data.find_a()
N = len(h)
#print len(h)

# Interactive query loop: read keywords, show which dictionary words were
# recognised, then tally matches for every loaded entry.
while True:
    s = raw_input('input the keywords:')
    query = unicode(s)
    # d is read position by position below; '0' marks a word that did not
    # match the query.
    d = trieKmp.gao(query)
    # fc collects the indices of the words that did match.
    fc = []
    for i in xrange(trieKmp.num):
        if d[i] == '0':
            continue
        fc.append(i)
    M = len(fc)
    # Echo the recognised words on one line (trailing commas suppress the
    # newline in Python 2 print statements).
    print '分词结果为: ',
    for ch in fc:
        print trieKmp.word[ch],
    print ''
    g = []
    # For each loaded entry, sum the integer flags at the matched word
    # positions.
    for i in xrange(N):
        cnt = 0
        for j in fc:
            cnt += int(h[i][1][j])
コード例 #4
0
ファイル: ask.py プロジェクト: liangsheng/crawler_my
from AC import trieKmp
from database import Database

# Python 2 idiom: reloading sys makes setdefaultencoding available again so
# that implicit str/unicode conversions below use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Open the database and load the result of find_a() once, before the prompt
# loop.  NOTE(review): h[i][1] is indexed per keyword further down, so each
# entry presumably carries a per-keyword flag string -- confirm in Database.
data = Database('data.db')
h = data.find_a()
N = len(h)
#print len(h)

# Interactive query loop: read keywords, show which dictionary words were
# recognised, then tally matches for every loaded entry.
while True:
    s = raw_input('input the keywords:')
    query = unicode(s)
    # d is read position by position below; '0' marks a word that did not
    # match the query.
    d = trieKmp.gao(query)
    # fc collects the indices of the words that did match.
    fc = []
    for i in xrange(trieKmp.num):
        if d[i] == '0':
            continue
        fc.append(i)
    M = len(fc)
    # Echo the recognised words on one line (trailing commas suppress the
    # newline in Python 2 print statements).
    print '分词结果为: ', 
    for ch in fc:
        print trieKmp.word[ch],
    print ''
    g = []
    # For each loaded entry, sum the integer flags at the matched word
    # positions.
    for i in xrange(N):
        cnt = 0
        for j in fc:
            cnt += int(h[i][1][j])