def update(self):
    for g in self.frame.get(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        self.done = True
        # write out the analytics result to disk
        outputFile = open('Analytics.txt', 'w')
        outputFile.write('Received invalid links from frontier:' + str(analytics.invalidUrlCount) + '\n')
        outputFile.write('Page with the most out links is:' + analytics.maxOutPutUrl)
        outputFile.write(', out links count is:' + str(analytics.maxOutPutUrlCount) + '\n')
        for key in analytics.subDomainDic:
            outputFile.write('Received subdomains:' + key)
            outputFile.write(', number of urls it has:' + str(analytics.subDomainDic.get(key)) + '\n')
        outputFile.close()
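A side note on the analytics dump above: a with block closes the file even if one of the writes raises. A minimal sketch of the same output using a context manager; the analytics object and its field names are taken from the variant above, not newly defined:

# Sketch only: same Analytics.txt dump, but the handle is closed
# automatically even if a write fails partway through.
with open('Analytics.txt', 'w') as outputFile:
    outputFile.write('Received invalid links from frontier:' + str(analytics.invalidUrlCount) + '\n')
    outputFile.write('Page with the most out links is:' + analytics.maxOutPutUrl)
    outputFile.write(', out links count is:' + str(analytics.maxOutPutUrlCount) + '\n')
    for key in analytics.subDomainDic:
        outputFile.write('Received subdomains:' + key)
        outputFile.write(', number of urls it has:' + str(analytics.subDomainDic.get(key)) + '\n')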
def update(self):
    for g in self.frame.get(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks = process_url_group(g, self.UserAgentString)
        for l in outputLinks:
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:  # was `url_count >= ...`, comparing a collection to an int
        self.done = True
def update(self):
    for g in self.frame.get_new(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        self.done = True
def update(self):
    for g in self.frame.get(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            # the URL is bad and our user agent string is not yet in its set of bad_urls
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            # the output link is valid and we have permission to crawl the site
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        self.done = True
def update(self):
    global invalid_count  # declared once at function scope rather than inside the loop
    for g in self.frame.get_new(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                update_subdomain_frequencies(l)
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
            else:
                invalid_count += 1  # keeps track of invalid link count
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        self.done = True
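The variant above calls update_subdomain_frequencies(l), whose definition is not shown here. A minimal sketch of what such a helper could look like, assuming a module-level counter keyed by hostname; both the dict name and the keying rule are assumptions, not taken from the source:

from collections import defaultdict
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

subdomain_frequencies = defaultdict(int)  # assumed module-level counter

def update_subdomain_frequencies(url):
    # Hypothetical helper: count one hit against the URL's hostname.
    host = urlparse(url).hostname
    if host:
        subdomain_frequencies[host] += 1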
def update(self):
    for g in self.frame.get_new(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            if is_valid(l):
                if robot_manager.Allowed(l, self.UserAgentString):
                    lObj = ProducedLink(l, self.UserAgentString)
                    self.frame.add(lObj)
                    url = urlparse(l)
                    s = '.'.join(url.hostname.split('.')[:-2])
                    self.subdomains[s] += 1
            else:
                self.invalid_links += 1
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        self.done = True
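This variant derives its subdomain key by dropping the last two hostname labels. A standalone demonstration of that expression; the example URL is illustrative only:

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

url = urlparse('http://vision.ics.uci.edu/papers')  # illustrative URL
s = '.'.join(url.hostname.split('.')[:-2])
print(s)  # prints 'vision.ics' -- everything left of 'uci.edu'

Note that url.hostname is None for links such as mailto: URLs, so the split above depends on is_valid filtering those out first.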
def update(self):
    for g in self.frame.get(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        # global average_time
        # average_time = (time() - self.starttime) / len(url_count)
        # with open("information.txt", "a") as info:
        #     info.write("number of bad link is " + str(numBadLink))
        #     info.write("Max sub link is " + str(MaxLink))
        #     info.write("Average download time is " + str(average_time))
        #     info.write("Total number of sub urls is " + str(sub_links))
        self.done = True
def update(self):
    global count_invalid
    global subdomain_to_urls
    for g in self.frame.get_new(OneUnProcessedGroup):
        print("Got a Group")
        outputLinks, urlResps = process_url_group(g, self.UserAgentString)
        for urlResp in urlResps:
            if urlResp.bad_url and self.UserAgentString not in set(urlResp.dataframe_obj.bad_url):
                urlResp.dataframe_obj.bad_url += [self.UserAgentString]
        for l in outputLinks:
            if is_valid(l) and robot_manager.Allowed(l, self.UserAgentString):
                lObj = ProducedLink(l, self.UserAgentString)
                self.frame.add(lObj)
                subdomain_to_urls[get_subdomain(l)].add(l)
                good_urls_time.append(time() - self.validtime)
                self.validtime = time()
            else:
                count_invalid += 1
    if len(url_count) >= MAX_LINKS_TO_DOWNLOAD:
        self.done = True
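The variant above appends a timestamp gap to good_urls_time for every accepted link. A short sketch of how those gaps could be summarized once the crawl finishes; the averaging helper is an assumption, not part of the source:

def summarize_timing(good_urls_time):
    # Hypothetical helper: average the recorded per-link gaps (seconds).
    if not good_urls_time:
        return 0.0
    return sum(good_urls_time) / float(len(good_urls_time))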
def initialize(self):
    self.count = 0
    l = ProducedLink("http://www.ics.uci.edu", self.UserAgentString)
    print(l.full_url)
    self.frame.add(l)