def crawl(self, trackingTimestamp):
    """Fetch every pending URL once, parse it, persist the result and track new seeds.

    trackingTimestamp -- passed through to self.fetchURL to select which URLs
    are due for (re-)crawling.
    """
    config = Configuration.Configuration.readFromFile()
    # maxFetchCount == -1 means "unlimited"; cap at 65535 so fetchURL always
    # receives a concrete bound.
    countLimit = 65535 if config.maxFetchCount == -1 else config.maxFetchCount
    urlsToFetch = self.fetchURL(trackingTimestamp, countLimit)
    if not urlsToFetch:
        print('No URL to fetch.')
        return
    for url in urlsToFetch:
        print('URL to fetch: ' + str(url))
        fetcher = Fetcher1.Fetcher1()
        html = fetcher.fetch(url.url, config)
        parser = Parser1.Parser1()
        parseResult = parser.parse(html, url.url)
        if parseResult.content is not None:
            try:
                CRM.saveEnterprise(parseResult.content)
            except Exception:
                # A failed save must not abort the whole crawl; log and continue.
                print(traceback.format_exc())
        tracker = Tracker1.Tracker1()
        # Scheme + host prefix: first "/" after position 7 skips "http://".
        # NOTE(review): assumes an "http://" URL — for "https://" the slash at
        # index 7 would truncate wrongly; confirm upstream URLs are http.
        basePath = url.url[:url.url.find("/", 7)]
        tracker.updateTrackTime(url.id)
        tracker.track(parseResult.newSeeds, url.id, self.id, basePath)
        print('Sleep ' + str(config.interval) + ' second.')
        # Throttle between requests per configuration.
        time.sleep(config.interval)
def crawl(self, trackingTimestamp, keyword=None):
    """Crawl pending URLs with login support; follow search-result pagination.

    trackingTimestamp -- passed through to self.fetchURL to select due URLs.
    keyword -- optional search term; when a fetched page is not a detail page
    and a keyword is given, a search is issued and its result pages are
    paginated until exhausted or the fetch count limit is reached.

    Raises Exception when the login attempt does not clear the login page.
    """
    config = Configuration.Configuration.readFromFile()
    # maxFetchCount == -1 means "unlimited"; cap at 65535 for a concrete bound.
    countLimit = 65535 if config.maxFetchCount == -1 else config.maxFetchCount
    urlsToFetch = self.fetchURL(trackingTimestamp, countLimit)
    if not urlsToFetch:
        print('No URL to fetch.')
        return
    fetcher = Fetcher2.Fetcher2()
    parser = Parser2.Parser2()
    count = 0
    tracker = Tracker2.Tracker2()
    for url in urlsToFetch:
        if count >= countLimit:
            print('Fetch count limitation reached: ' + str(countLimit))
            break
        count += 1
        print('URL to fetch: ' + str(url))
        html = fetcher.fetch(url.url, config)
        if parser.needLogin(html):
            print('Need to Login')
            html = fetcher.login(self.username, self.password)
            if parser.needLogin(html):
                raise Exception("Login fail!")
            print('Login success!')
            # Re-fetch the original URL now that the session is authenticated.
            html = fetcher.fetch(url.url, config)
        if parser.isDetailPage(html):
            parseResult = parser.parse(html, url.url, config)
            if parseResult.content is not None:
                try:
                    CRM.saveEnterprise(parseResult.content)
                except Exception:
                    # A failed save must not abort the crawl; log and continue.
                    print(traceback.format_exc())
            tracker.updateTrackTime(url.id)
            tracker.track(parseResult.newSeeds, url.id, self.id, None)
        elif keyword is not None:
            print('Search term: ' + keyword)
            html = fetcher.search(keyword)
            tracker.updateTrackTime(url.id)
            page = 1
            while True:
                parseSearchResult = parser.parseSearchResult(html)
                tracker.track(parseSearchResult.newSeeds, url.id, self.id, None)
                # Stop when there is no "next page" seed or the limit is hit.
                if parseSearchResult.newSeedRightNow is None or count >= countLimit:
                    print('parseSearchResult.newSeedRightNow == None: ' + str(parseSearchResult.newSeedRightNow is None))
                    print('count >= countLimit: ' + str(count >= countLimit))
                    break
                page += 1
                print('Will crawl page ' + str(page) + ': ' + parseSearchResult.newSeedRightNow['href'])
                print('Sleep ' + str(config.interval) + ' second.')
                time.sleep(config.interval)
                html = fetcher.fetch(parseSearchResult.newSeedRightNow['href'], config)
                if html is None:
                    # Bounded retry on fetch failure.
                    retryTimes = 0
                    while retryTimes < config.maxRetryTimes and html is None:
                        retryTimes += 1
                        print('Retry ' + str(retryTimes))
                        html = fetcher.fetch(parseSearchResult.newSeedRightNow['href'], config)
                count += 1
        print('Sleep ' + str(config.interval) + ' second.')
        # Throttle between URLs per configuration.
        time.sleep(config.interval)
def testGetCountryId(self):
    """CRM.getCountryId must resolve a known country name to a positive id."""
    country_id = CRM.getCountryId('China')
    print('Country Id: ' + str(country_id))
    self.assertTrue(country_id > 0)
def testSaveEnterprise(self):
    """CRM.saveEnterprise must accept a fully-populated Enterprise without raising."""
    enterprise = Enterprise('testSaveEnterprise', 'admin', '*****@*****.**', '123456', '234567', '345678', 'zhyfoundry-spider', 'remark', 'keyword', 'China')
    CRM.saveEnterprise(enterprise)
def crawl(self, trackingTimestamp, keyword=None):
    """Crawl pending URLs with login support; follow search-result pagination.

    trackingTimestamp -- passed through to self.fetchURL to select due URLs.
    keyword -- optional search term; when a fetched page is not a detail page
    and a keyword is given, a search is issued and its result pages are
    paginated until exhausted or the fetch count limit is reached.

    Raises Exception when the login attempt does not clear the login page.
    """
    config = Configuration.Configuration.readFromFile()
    # maxFetchCount == -1 means "unlimited"; cap at 65535 for a concrete bound.
    countLimit = 65535 if config.maxFetchCount == -1 else config.maxFetchCount
    urlsToFetch = self.fetchURL(trackingTimestamp, countLimit)
    if not urlsToFetch:
        print('No URL to fetch.')
        return
    fetcher = Fetcher2.Fetcher2()
    parser = Parser2.Parser2()
    count = 0
    tracker = Tracker2.Tracker2()
    for url in urlsToFetch:
        if count >= countLimit:
            print('Fetch count limitation reached: ' + str(countLimit))
            break
        count += 1
        print('URL to fetch: ' + str(url))
        html = fetcher.fetch(url.url, config)
        if parser.needLogin(html):
            print('Need to Login')
            html = fetcher.login(self.username, self.password)
            if parser.needLogin(html):
                raise Exception("Login fail!")
            print('Login success!')
            # Re-fetch the original URL now that the session is authenticated.
            html = fetcher.fetch(url.url, config)
        if parser.isDetailPage(html):
            parseResult = parser.parse(html, url.url, config)
            if parseResult.content is not None:
                try:
                    CRM.saveEnterprise(parseResult.content)
                except Exception:
                    # A failed save must not abort the crawl; log and continue.
                    print(traceback.format_exc())
            tracker.updateTrackTime(url.id)
            tracker.track(parseResult.newSeeds, url.id, self.id, None)
        elif keyword is not None:
            print('Search term: ' + keyword)
            html = fetcher.search(keyword)
            tracker.updateTrackTime(url.id)
            page = 1
            while True:
                parseSearchResult = parser.parseSearchResult(html)
                tracker.track(parseSearchResult.newSeeds, url.id, self.id, None)
                # Stop when there is no "next page" seed or the limit is hit.
                if parseSearchResult.newSeedRightNow is None or count >= countLimit:
                    print('parseSearchResult.newSeedRightNow == None: ' + str(parseSearchResult.newSeedRightNow is None))
                    print('count >= countLimit: ' + str(count >= countLimit))
                    break
                page += 1
                print('Will crawl page ' + str(page) + ': ' + parseSearchResult.newSeedRightNow['href'])
                print('Sleep ' + str(config.interval) + ' second.')
                time.sleep(config.interval)
                html = fetcher.fetch(parseSearchResult.newSeedRightNow['href'], config)
                if html is None:
                    # Bounded retry on fetch failure.
                    retryTimes = 0
                    while retryTimes < config.maxRetryTimes and html is None:
                        retryTimes += 1
                        print('Retry ' + str(retryTimes))
                        html = fetcher.fetch(parseSearchResult.newSeedRightNow['href'], config)
                count += 1
        print('Sleep ' + str(config.interval) + ' second.')
        # Throttle between URLs per configuration.
        time.sleep(config.interval)