コード例 #1
0
    def update(self):
        for g in self.frame.get(OneUnProcessedGroup):
            print "Got a Group"
            outputLinks, urlResps = process_url_group(g, self.UserAgentString)
            for urlResp in urlResps:
                if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                    urlResp.dataframe_obj.bad_url += [self.UserAgentString]

            for l in outputLinks:
                if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                    lObj = ProducedLink(l, self.UserAgentString)
                    self.frame.add(lObj)
        if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
            self.done = True

            #write out the analytics result to disk
            outputFile = open('Analytics.txt', 'w')
            outputFile.write('Received invalid links from frontier:' + str(analytics.invalidUrlCount) + '\n')

            outputFile.write('Page with the most out links is:' + analytics.maxOutPutUrl)
            outputFile.write(', out links count is:' + str(analytics.maxOutPutUrlCount) + '\n')

            for key in analytics.subDomainDic:
                outputFile.write('Received subdomains:' + key)
                outputFile.write(', number of urls it has:' + str(analytics.subDomainDic.get(key)) + '\n')

            outputFile.close()
コード例 #2
0
 def update(self):
     for g in self.frame.get(OneUnProcessedGroup):
         print "Got a Group"
         outputLinks = process_url_group(g, self.UserAgentString)
         for l in outputLinks:
             if is_valid(l) and robot_manager.Allowed(
                     l, self.UserAgentString):
                 lObj = ProducedLink(l, self.UserAgentString)
                 self.frame.add(lObj)
     if url_count >= MAX_LINKS_TO_DOWNLOAD:
         self.done = True
コード例 #3
0
 def update(self):
     for g in self.frame.get_new(OneUnProcessedGroup):
         print "Got a Group"
         outputLinks, urlResps = process_url_group(g, self.UserAgentString)
         for urlResp in urlResps:
             if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                 urlResp.dataframe_obj.bad_url += [self.UserAgentString]
         for l in outputLinks:
             if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                 lObj = ProducedLink(l, self.UserAgentString)
                 self.frame.add(lObj)
     if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
         self.done = True
コード例 #4
0
 def update(self):
     for g in self.frame.get(OneUnProcessedGroup):
         print "Got a Group"
         outputLinks, urlResps = process_url_group(g, self.UserAgentString)
         for urlResp in urlResps:
             if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url): #verify url is invalid and the user agent string is not in the set of bad_urls 
                 urlResp.dataframe_obj.bad_url += [self.UserAgentString]
         for l in outputLinks:
             if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString): #if the outpult linke is valid and you have permission to crawl the website 
                 lObj = ProducedLink(l, self.UserAgentString)
                 self.frame.add(lObj)
     if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
         self.done = True
コード例 #5
0
 def update(self):
     for g in self.frame.get_new(OneUnProcessedGroup):
         print "Got a Group"
         global invalid_count
         outputLinks, urlResps = process_url_group(g, self.UserAgentString)
         for urlResp in urlResps:
             if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                 urlResp.dataframe_obj.bad_url += [self.UserAgentString]
         for l in outputLinks:
             if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                 update_subdomain_frequencies(l)
                 lObj = ProducedLink(l, self.UserAgentString)
                 self.frame.add(lObj)
             else:
                 invalid_count += 1 # keeps track of invalid link count
     if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
         self.done = True
コード例 #6
0
    def update(self):
        """Consume new groups; queue permitted links, tally subdomain hits
        and invalid links, and finish when the download cap is reached.
        """
        for group in self.frame.get_new(OneUnProcessedGroup):
            print("Got a Group")
            links, responses = process_url_group(group, self.UserAgentString)
            for response in responses:
                # Tag a bad URL once per user-agent string.
                already = set(response.dataframe_obj.bad_url) if response.bad_url else ()
                if response.bad_url and self.UserAgentString not in already:
                    response.dataframe_obj.bad_url += [self.UserAgentString]
            for link in links:
                if not is_valid(link):
                    self.invalid_links += 1
                    continue
                if robot_manager.Allowed(link, self.UserAgentString):
                    self.frame.add(ProducedLink(link, self.UserAgentString))

                    # Hostname minus its last two labels (the registered
                    # domain), e.g. 'www.ics' from 'www.ics.uci.edu'.
                    parsed = urlparse(link)
                    subdomain = '.'.join(parsed.hostname.split('.')[:-2])
                    self.subdomains[subdomain] += 1

        if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
            self.done = True
コード例 #7
0
ファイル: crawler_frame.py プロジェクト: nicolaskang/cs211
 def update(self):
     for g in self.frame.get(OneUnProcessedGroup):
         print "Got a Group"
         outputLinks, urlResps = process_url_group(g, self.UserAgentString)
         for urlResp in urlResps:
             if urlResp.bad_url and self.UserAgentString not in set(
                     urlResp.dataframe_obj.bad_url):
                 urlResp.dataframe_obj.bad_url += [self.UserAgentString]
         for l in outputLinks:
             if is_valid(l) and robot_manager.Allowed(
                     l, self.UserAgentString):
                 lObj = ProducedLink(l, self.UserAgentString)
                 self.frame.add(lObj)
     if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
         # global  average_time
         # average_time = (time()-self.starttime)/len(url_count)
         # with open("information.txt", "a") as info:
         #     info.write("number of bad link is "+str(numBadLink))
         #     info.write("Max sub link is "+str(MaxLink))
         #     info.write("Average download time is "+str(average_time))
         #     info.write("Total number of sub urls is "+str(sub_links))
         self.done = True
コード例 #8
0
    def update(self):
        global count_invalid
        global subdomain_to_urls

        for g in self.frame.get_new(OneUnProcessedGroup):
            print "Got a Group"
            outputLinks, urlResps = process_url_group(g, self.UserAgentString)
            for urlResp in urlResps:
                if urlResp.bad_url and self.UserAgentString not in set(
                        urlResp.dataframe_obj.bad_url):
                    urlResp.dataframe_obj.bad_url += [self.UserAgentString]
            for l in outputLinks:
                if is_valid(l) and robot_manager.Allowed(
                        l, self.UserAgentString):
                    lObj = ProducedLink(l, self.UserAgentString)
                    self.frame.add(lObj)
                    subdomain_to_urls[get_subdomain(l)].add(l)
                    good_urls_time.append(time() - self.validtime)
                    self.validtime = time()
                else:
                    count_invalid += 1
        if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
            self.done = True