def run_crawler(duration = 120):
    """Run one crawl session and persist its state afterwards.

    The function loads the link keywords, builds the page loader, results
    store, result filter, ranker, visit history and scheduler, flags the
    ranker and the scheduler as active and then sleeps for ``duration``
    seconds.  After that it puts the page loader into sleep mode, polls
    ``ready_to_shutdown()`` on the scheduler once per second (at most 60
    one-second waits), deactivates scheduler and ranker, saves the state of
    every component and finally lists the collected sites.

    :param duration: number of seconds to let the crawler run before the
        shutdown sequence starts.
    """
    KeyWords.load_link_kw()

    # Build the components in dependency order: each later object receives
    # the earlier ones through its constructor.
    loader = PageLoader.PageLoader_v2()
    results = Results.Results()
    result_filter = ResultFilter.ResultFilter(results)
    ranker = Ranker.Ranker(loader, result_filter)
    history = VisitHistory.VisitHistory()
    scheduler = Scheduler.Scheduler(loader, result_filter, ranker, history, 40)

    ranker.active = True
    scheduler.active = True

    # NOTE: no start page is queued in this function even though the
    # message below mentions one.
    print('all initialized. Addint start page to PageLoader in 3 seconds')
    print('now running for %i seconds' % duration)
    sleep(duration)

    # Shutdown: stop the loader first, then give the scheduler a bounded
    # amount of time to report that it is ready.
    loader.enter_sleep_mode()
    seconds_waited = 0
    while True:
        if scheduler.ready_to_shutdown() or seconds_waited >= 60:
            break
        seconds_waited += 1
        sleep(1)

    scheduler.active = False
    ranker.active = False

    # Persist every component, then report the sites that were found.
    scheduler.save_state()
    loader.save_state()
    results.save_state()
    history.save_state()
    results.list_sites()
# -*- coding: UTF-8 -*-
__author__ = 'Павел'

from crawler import PageLoader, Ranker, Scheduler, ResultFilter, Results
from crawler.PageLoader import PageLoaderTask
from time import sleep
from os import environ
from KeyWords import KeyWords

# Set the http_proxy environment variable at import time, i.e. before the
# page loader below is created.
environ['http_proxy'] = 'http://192.168.0.2:3128'

if __name__ == '__main__':
    # Small manual run: load the link keywords, queue a single start page,
    # wire results -> filter -> ranker, switch the ranker on and fall off
    # the end of the script.
    KeyWords.load_link_kw()

    loader = PageLoader.PageLoader_v2()
    loader.add_task(PageLoaderTask('http://bookz.ru'))

    results = Results.Results()
    result_filter = ResultFilter.ResultFilter(results)

    ranker = Ranker.Ranker(loader, result_filter)
    ranker.active = True

    print('now will exit')
class LinkWeighter:
    """Weight the links of a page by the keywords found in the text around them.

    The window helpers work on a "mapped content" sequence in which the
    marker of link ``linkId`` sits at position ``linkId * 2 + 1`` and the
    pieces collected for a window are the entries two positions apart on
    either side of that marker.
    """

    def __init__(self, kw=None):
        """Store the keyword source used for weighting.

        :param kw: object exposing ``words`` (intersected with the set of
            words in a window) and ``keyWords`` (indexed by word to get its
            weight).  When ``None``, a ``KeyWords`` instance is created and
            its ``loadLinkKW()`` is called.
        """
        self.kw = kw
        if self.kw is None:
            self.kw = KeyWords()
            self.kw.loadLinkKW()

    def weightLinksOnPage(self, bodySoup, winSize=3):
        """Return a dict mapping the ``href`` of every link to its weight.

        :param bodySoup: page body handed to
            ``ContentExtractor.extractMappedContentList``.
        :param winSize: window size forwarded to :meth:`weightWindowed`.
        :returns: ``{href: weight}``.
        """
        links = {}
        extractor = ContentExtractor()
        mapped = extractor.extractMappedContentList(bodySoup)
        content = mapped[0]
        for item in content:
            # Integer entries are link ids; everything else is skipped.
            if not isinstance(item, int):
                continue
            href = mapped[1][item].get('href')
            # NOTE(review): when the same href occurs more than once the
            # last occurrence wins; an earlier "max(...)" remark suggests the
            # maximum may have been intended -- confirm before changing.
            links[href] = self.weightWindowed(content, item, winSize)
        return links

    def weightWindowed(self, MappedContent, linkId, winSize, method='link'):
        """Compute the weight of one link from the text in its window.

        :param MappedContent: mapped content sequence (see class docstring).
        :param linkId: id of the link to weight.
        :param winSize: an int used for both sides, or an indexable pair
            ``(left, right)``.
        :param method: ``'link'`` to measure the window in content pieces
            (:meth:`getTextInLinkWindow`) or ``'char'`` to measure it in
            characters (:meth:`getTextInCharWindow`).
        :returns: float weight; ``1.0`` when no keyword is in the window.
        :raises AttributeError: if ``method`` is neither ``'link'`` nor
            ``'char'``.
        """
        if isinstance(winSize, int):
            winL = winR = winSize
        else:
            winL = winSize[0]
            winR = winSize[1]

        if method == 'link':
            text = self.getTextInLinkWindow(MappedContent, linkId, winL, winR)
        elif method == 'char':
            # The character-based window has its own helper.
            text = self.getTextInCharWindow(MappedContent, linkId, winL, winR)
        else:
            raise AttributeError(
                "unknown window method %r (expected 'link' or 'char')" % (method,)
            )

        words = set(re.split('[ \t\n]', text.lower()))
        weight = 1.0
        for word in self.kw.words & words:
            # Each keyword closes part of the remaining gap to 10.
            weight = weight + self.kw.keyWords[word] * (10 - weight) / 10
        return weight

    def getTextInCharWindow(self, MappedContent, linkId, winSizeL=100, winSizeR=100):
        """Return the text around a link, with the window sized in characters.

        Whole pieces are taken on each side until at least ``winSizeL``
        (respectively ``winSizeR``) characters have been gathered or the
        sequence ends, so a side may exceed its nominal size.

        :returns: left text followed by right text, in document order.
        """
        winCenter = linkId * 2 + 1
        content_len = len(MappedContent)

        left_parts = []
        gathered = 0
        index = winCenter - 1
        while gathered < winSizeL and index >= 0:
            piece = MappedContent[index]
            left_parts.append(piece)
            gathered += len(piece)
            index -= 2
        # Pieces were collected walking away from the link; restore order.
        left_parts.reverse()

        right_parts = []
        gathered = 0
        index = winCenter + 1
        while gathered < winSizeR and index < content_len:
            piece = MappedContent[index]
            right_parts.append(piece)
            gathered += len(piece)
            index += 2

        return ''.join(left_parts) + ''.join(right_parts)

    def getTextInLinkWindow(self, MappedContent, linkId, winSizeL=1, winSizeR=1):
        """Return the text around a link, with the window sized in pieces.

        Takes up to ``winSizeL`` pieces to the left of the link marker and up
        to ``winSizeR + 1`` pieces to the right.  Integer entries met on the
        way are converted with ``str``.

        :returns: left text followed by right text, in document order.
        """
        winCenter = linkId * 2 + 1
        content_len = len(MappedContent)

        left_parts = []
        index = winCenter - 1
        while winSizeL != 0 and index >= 0:
            piece = MappedContent[index]
            if isinstance(piece, int):
                piece = str(piece)
            left_parts.append(piece)
            index -= 2
            winSizeL -= 1
        # Pieces were collected walking away from the link; restore order.
        left_parts.reverse()

        right_parts = []
        index = winCenter + 1
        # The right side takes one extra piece -- presumably the piece right
        # after the marker is the link's own text; TODO confirm.
        winSizeR += 1
        while winSizeR != 0 and index < content_len:
            piece = MappedContent[index]
            if isinstance(piece, int):
                piece = str(piece)
            right_parts.append(piece)
            index += 2
            winSizeR -= 1

        return ''.join(left_parts) + ''.join(right_parts)
def __init__(self, kw=None):
    """Remember the keyword source to use.

    :param kw: keyword object to store on ``self.kw``.  When it is ``None``
        a new ``KeyWords`` instance is stored instead and its
        ``loadLinkKW()`` method is called.
    """
    if kw is not None:
        self.kw = kw
        return
    # No keyword source supplied: create one and load the link keywords.
    self.kw = KeyWords()
    self.kw.loadLinkKW()