Example #1
    # Excerpt from a spider class; relies on `import time` and `import traceback`
    # plus the project modules Configuration, Fetcher1, Parser1, Tracker1 and CRM.
    def crawl(self, trackingTimestamp):
        config = Configuration.Configuration.readFromFile()
        # maxFetchCount == -1 means "no limit"; fall back to a large cap.
        countLimit = 65535 if config.maxFetchCount == -1 else config.maxFetchCount
        urlsToFetch = self.fetchURL(trackingTimestamp, countLimit)
        if not urlsToFetch:
            print('No URL to fetch.')
            return
        for url in urlsToFetch:
            print('URL to fetch: ' + str(url))
            fetcher = Fetcher1.Fetcher1()
            html = fetcher.fetch(url.url, config)

            parser = Parser1.Parser1()
            parseResult = parser.parse(html, url.url)

            if parseResult.content is not None:
                try:
                    CRM.saveEnterprise(parseResult.content)
                except Exception:
                    print(traceback.format_exc())

            tracker = Tracker1.Tracker1()
            # Scheme + host, e.g. 'http://example.com': slice up to the first '/'
            # after the 'http://' prefix (offset 7).
            basePath = url.url[:url.url.find("/", 7)]
            tracker.updateTrackTime(url.id)
            tracker.track(parseResult.newSeeds, url.id, self.id, basePath)

            print('Sleep %s seconds.' % config.interval)
            time.sleep(config.interval)
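For orientation, a minimal driver sketch. The Spider1 class name, its no-argument constructor, and the "cutoff timestamp" reading of trackingTimestamp are assumptions; only the crawl(trackingTimestamp) signature comes from the example above.

    import time

    # Hypothetical entry point; Spider1 and its constructor are assumptions.
    if __name__ == '__main__':
        spider = Spider1()
        # Assumed semantics: re-crawl anything not tracked in the last 24 hours.
        spider.crawl(trackingTimestamp=time.time() - 24 * 60 * 60)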
Example #2
    # Excerpt from a spider class; relies on `import time` and `import traceback`
    # plus the project modules Configuration, Fetcher2, Parser2, Tracker2 and CRM.
    def crawl(self, trackingTimestamp, keyword=None):
        config = Configuration.Configuration.readFromFile()
        # maxFetchCount == -1 means "no limit"; fall back to a large cap.
        countLimit = 65535 if config.maxFetchCount == -1 else config.maxFetchCount
        urlsToFetch = self.fetchURL(trackingTimestamp, countLimit)
        if not urlsToFetch:
            print('No URL to fetch.')
            return
        fetcher = Fetcher2.Fetcher2()
        parser = Parser2.Parser2()
        tracker = Tracker2.Tracker2()
        count = 0
        for url in urlsToFetch:
            if count >= countLimit:
                print('Fetch count limit reached: %d' % countLimit)
                break
            count += 1
            print('URL to fetch: ' + str(url))
            html = fetcher.fetch(url.url, config)

            # Log in when the page demands it, then re-fetch the original URL.
            if parser.needLogin(html):
                print('Need to log in.')
                html = fetcher.login(self.username, self.password)
                if parser.needLogin(html):
                    raise Exception('Login failed!')
                print('Login succeeded!')
                html = fetcher.fetch(url.url, config)

            if parser.isDetailPage(html):
                parseResult = parser.parse(html, url.url, config)
                if parseResult.content is not None:
                    try:
                        CRM.saveEnterprise(parseResult.content)
                    except Exception:
                        print(traceback.format_exc())
                    tracker.updateTrackTime(url.id)
                    tracker.track(parseResult.newSeeds, url.id, self.id, None)
            elif keyword is not None:
                print('Search term: ' + keyword)
                html = fetcher.search(keyword)
                tracker.updateTrackTime(url.id)
                page = 1
                # Page through the search results until there is no "next" seed
                # or the fetch limit is reached.
                while True:
                    parseSearchResult = parser.parseSearchResult(html)
                    tracker.track(parseSearchResult.newSeeds, url.id, self.id, None)
                    if parseSearchResult.newSeedRightNow is None or count >= countLimit:
                        print('newSeedRightNow is None: %s' % (parseSearchResult.newSeedRightNow is None))
                        print('count >= countLimit: %s' % (count >= countLimit))
                        break
                    page += 1
                    print('Will crawl page %d: %s' % (page, parseSearchResult.newSeedRightNow['href']))
                    print('Sleep %s seconds.' % config.interval)
                    time.sleep(config.interval)
                    html = fetcher.fetch(parseSearchResult.newSeedRightNow['href'], config)
                    if html is None:
                        retryTimes = 0
                        while retryTimes < config.maxRetryTimes and html is None:
                            retryTimes += 1
                            print('Retry %d' % retryTimes)
                            html = fetcher.fetch(parseSearchResult.newSeedRightNow['href'], config)
                        if html is None:
                            break  # retries exhausted; give up on this result page
                    count += 1

            print('Sleep %s seconds.' % config.interval)
            time.sleep(config.interval)
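The inline retry loop above can be lifted into a small helper. A sketch under the example's apparent contract, assuming fetcher.fetch(url, config) returns None on failure and that config carries maxRetryTimes and interval as in Example #2 (the pause between retries is an added assumption):

    import time

    def fetch_with_retry(fetcher, href, config):
        # Mirrors Example #2's retry loop: re-issue the fetch until it returns
        # HTML or config.maxRetryTimes attempts have been used.
        html = fetcher.fetch(href, config)
        retryTimes = 0
        while html is None and retryTimes < config.maxRetryTimes:
            retryTimes += 1
            print('Retry %d' % retryTimes)
            time.sleep(config.interval)  # assumption: pause between retries
            html = fetcher.fetch(href, config)
        return html  # may still be None if every attempt failed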
Example #3
    def testGetCountryId(self):
        country_id = CRM.getCountryId('China')
        print('Country Id: ' + str(country_id))
        # A positive id means the country lookup succeeded.
        self.assertTrue(country_id > 0)
Example #4
    def testSaveEnterprise(self):
        # Fixture values; the email address is masked in the source.
        enterprise = Enterprise('testSaveEnterprise', 'admin', '*****@*****.**',
                                '123456', '234567', '345678', 'zhyfoundry-spider',
                                'remark', 'keyword', 'China')
        CRM.saveEnterprise(enterprise)
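Both test methods above presumably live in a unittest.TestCase subclass. A minimal harness sketch; the CRMTest class name and the import path are assumptions, while CRM.getCountryId and the assertion come from Example #3:

    import unittest

    import CRM  # project-local module, as used in Examples #3 and #4

    class CRMTest(unittest.TestCase):  # hypothetical container for the tests
        def testGetCountryId(self):
            country_id = CRM.getCountryId('China')
            self.assertTrue(country_id > 0)

    if __name__ == '__main__':
        unittest.main()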