def readWebPage(self, urlString, depth=1, isExternal=False):
    """Rebuild a WebPage, with its outgoing links, from the stored records.

    The address looked up is ``WebPage.parseUrl(urlString).string``.

    urlString  -- address of the page to look up.
    depth      -- depth given to the page when the session table holds no
                  'depth' row for it; a stored value takes precedence.
    isExternal -- passed straight through to the WebPage constructor.

    Returns None when no website row matches the address; otherwise a
    WebPage whose ``links`` attribute is a list of WebPage objects built
    from the joined link rows, each with ``parent`` set to the returned
    page and ``depth`` one greater than the page's depth.
    """
    address = WebPage.parseUrl(urlString).string
    matches = self.db.websites.search(filters=all(eq('address', address))).rows()
    if len(matches) < 1:
        return None

    record = matches[0]
    pageId = record[0]

    # A depth recorded for this page in the session table overrides the
    # caller-supplied default.
    storedDepth = self.db.session.search('depth', all(eq('website_id', pageId)))
    if len(storedDepth) > 0:
        depth = storedDepth[0][0]

    result = WebPage(url=record[1], depth=depth, isExternal=isExternal)

    # NOTE(review): the statement is assembled with str.format; the
    # substituted values are table field names and the id read above.
    template = 'SELECT w.{0}, r.{0} from links join websites as w on links.{1} = w.id join websites as r on links.{2} = r.id WHERE w.id = {3};'
    statement = template.format(
        self.db.websites.fields[1],
        self.db.links.fields[1],
        self.db.links.fields[2],
        pageId)
    linkRows = self.db.execute(statement)

    # Column 1 of each row is the r.<field> value of the joined target row.
    result.links = [
        WebPage(url=linkRow[1], parent=result, depth=depth + 1)
        for linkRow in linkRows
    ]
    return result
def test():
    """Manual smoke run of the WebsiteDatabase calls against a live page.

    Downloads one page, inserts it into the database, appends it to the
    session if it is not already there, reads it back (the result is
    discarded), and finally prints whether a second page was visited.
    """
    firstPage = WebPage(url='pduch.kis.p.lodz.pl')
    firstPage.downloadContent()

    history = WebsiteDatabase()
    history.insertWebpage(firstPage, connection=True)
    alreadyInSession = history.isInThisSession(firstPage)
    if not alreadyInSession:
        history.appendSession(firstPage)
    history.readWebPage('pduch.kis.p.lodz.pl')

    secondPage = WebPage(url='http://www.kis.p.lodz.pl/')
    print(history.wasPageVisited(secondPage))
def _taskHandler(self, url):
    """Fetch the page at `url`; on success save it and add its unvisited hrefs.

    Nothing is saved or added when fetch() returns a falsy value.
    """
    # Original author's note: fetching the page source and then saving it
    # are both highly blocking operations, so they are handed to a thread.
    page = WebPage(url)
    if not page.fetch():
        return
    self._saveTaskResults(page)
    self._addUnvisitedHrefs(page)
def _taskHandler(self, url):
    """Fetch `url` and, if the fetch succeeds, save it and add its unvisited hrefs.

    Original author's note: functions whose names start with an underscore
    are the ones placed on the queue for worker threads to pick up.
    """
    page = WebPage(url)
    fetched = page.fetch()
    if fetched:
        self._saveTaskResults(page)
        self._addUnvisitedHrefs(page)
def selfTesting(self, args):
    """Run a quick self-check and print the outcome.

    Fetches http://www.baidu.com/ and then reports one of three results:
    the fetch returned None, the database check failed, or the fetched
    page was saved.

    args -- options object; only ``args.dbFile`` is read, for the
            database-permission message.
    """
    url = 'http://www.baidu.com/'
    print('\nVisiting www.baidu.com')
    pageSource = WebPage(url).fetch()
    # Identity test: None is a singleton, so `is` is the correct comparison
    # and cannot be affected by a custom __eq__ on the fetched value.
    if pageSource is None:
        print('Please check your network and make sure it\'s connected.\n')
    elif not self._isDatabaseAvaliable():
        print('Please make sure you have the permission to save data: %s\n' % args.dbFile)
    else:
        self._saveTaskResults(url, pageSource)
        print('Create logfile and database Successfully.')
        print('Already save Baidu.com, Please check the database record.')
        print('Seems No Problem!\n')
def selfTesting(self):
    """Run a quick self-check and print the outcome.

    Fetches http://www.baidu.com and then reports one of three results:
    the fetch returned None, the database check failed, or the page was
    saved.
    """
    url = 'http://www.baidu.com'
    print('\nVisiting www.baidu.com using directly')
    my_web = WebPage(url)
    # Test the network connection.
    pageSource = my_web.fetch()
    # Identity test: None is a singleton, so `is` is the correct comparison.
    if pageSource is None:
        print('please check your network')
    elif not self.isDatabaseAvaliable():
        # NOTE(review): `args` is not a parameter of this method; it must
        # resolve from an enclosing/module scope or this line raises
        # NameError -- confirm where it is defined.
        print('please make sure you have the permission to save data: %s\n' % args.dbFile)
    else:
        self._saveTaskResults(my_web)
        print('save data successfully')
        print('seems all is ok')
def __init__(self, args, depth=1):
    """Set up crawl state from the parsed options.

    args  -- options object; ``args.url`` (iterable of addresses),
             ``args.graph`` and ``args.rank`` are read here, and the whole
             object is kept as ``self.options``.
    depth -- stored as ``self.depth``.

    ``self.webGraph`` is created only when ``args.graph`` or ``args.rank``
    is truthy; otherwise the attribute is left unset.
    """
    self.links = [WebPage(address) for address in args.url]
    self.depth = depth
    self.historyDb = WebsiteDatabase()
    self.done = False
    self.options = args
    # One fresh Result per start page, keyed by that page's domain.
    self.results = dict((page.url.domain, Result()) for page in self.links)
    self.cloudIndexer = CloudSearchIndexer.forDomainIndex("websites")

    wantsGraph = args.graph or args.rank
    if not wantsGraph:
        return

    # Add one node per start page, keyed by domain.
    self.webGraph = Graph(distance=30.0)
    for page in self.links:
        self.webGraph.add_node(page.url.domain, radius=15, fill=(1, 0, 0, 0.5))
def _taskHandler(self, url):
    """Fetch the page at `url`; when fetch() is truthy, save it and add its unvisited hrefs."""
    fetchedPage = WebPage(url)
    succeeded = fetchedPage.fetch()
    if not succeeded:
        return
    self._saveTaskResults(fetchedPage)
    self._addUnvisitedHrefs(fetchedPage)