def start_meta(self, attributes):
    """Record the useful parts of a <meta> tag on the parser.

    Meta information is becoming less and less useful (it is only the
    web master's own description of the page, which can usually be found
    from the page itself), so this handler may be removed in the future.

    attributes -- iterable of (name, value) pairs for the tag.

    Side effects:
      * self.meta['description'] / self.meta['title'] are set when the tag
        is named "description" / "title" and carries a text attribute.
      * self.has_meta_refresh is set to True for http-equiv="refresh".
      * self.meta_refresh_url is set from uummuuObjects.makeURL(self.url, ...)
        when the content attribute contains "url=".
    """
    # Collect the attributes first so the result does not depend on the
    # order in which the page author happened to write them.
    attrs = {}
    for name, value in attributes:
        attrs[name.lower()] = value

    meta_name = (attrs.get('name') or '').lower()
    if meta_name in ('description', 'title'):
        # 'value' is what this parser has always read, so it keeps
        # priority; 'content' is the attribute HTML actually defines.
        text = attrs.get('value')
        if text is None:
            text = attrs.get('content')
        if text is not None:
            self.meta[meta_name] = text

    # if this is a meta refresh it should have http-equiv in its name
    if (attrs.get('http-equiv') or '').lower() == 'refresh':
        # tell that it has a meta refresh
        self.has_meta_refresh = True

    # if this also has a content and it contains a url to refresh to
    # NOTE(review): as before, this is not restricted to refresh tags; any
    # content containing "url=" sets meta_refresh_url -- confirm intended.
    content = attrs.get('content') or ''
    lowered = content.lower()
    start_pos = lowered.find('url=')
    if start_pos != -1:
        # The target runs from just after "url=" to the next ';' (or the end).
        end_pos = lowered.find(';', start_pos)
        if end_pos == -1:
            target = content[start_pos + 4:]
        else:
            target = content[start_pos + 4:end_pos]
        self.meta_refresh_url = uummuuObjects.makeURL(self.url, target)
def start_a(self, attributes):
    """Handle an opening <a> tag: collect its link and register its style.

    There are many different types of links. javascript: and mailto links
    are not followed, and neither are location tags (a pound sign that
    just redirects within this page).

    attributes -- iterable of (name, value) pairs for the tag.

    Side effects:
      * calls self.add_to_StyledObjects() before anything else.
      * sets self.inside_an_element = 1 when an href attribute is present.
      * appends a non-empty link to self.no_follow_links (rel contains
        "nofollow") or to self.hyperlinks (otherwise).
      * updates self.curr_tag and self.whole_tag, consuming
        self.unnamed_count when the tag has no id.
      * passes an inline style to self.styleParser.add(..., 100).
    """
    link = ''
    no_follow = False
    this_id = ''
    this_class = ''
    this_style = ''
    this_has_style = False

    # make sure to add data to styled objects
    self.add_to_StyledObjects()

    for name, value in attributes:
        name = name.lower()
        if name == 'id':
            this_id = value
        elif name == 'class':
            this_class = value
        elif name == 'rel':
            # rel is a space separated list of tokens, e.g. "nofollow noopener"
            if 'nofollow' in value.lower().split():
                no_follow = True
        elif name == 'href':
            self.inside_an_element = 1
            # remove any anchor references, including a bare "#top"
            fragment_at = value.find('#')
            if fragment_at >= 0:
                value = value[:fragment_at]
            if value == '':
                # got an anchor reference (or an empty href)...not adding it.
                # This check must come first: value[0] below needs a character.
                pass
            elif value[:4].lower() == 'http':
                link = value
            elif value[0] == '/':
                link = uummuuObjects.makeURL(self.domain, value)
            elif 'javascript:' in value[:15].lower():
                # got a javascript... not adding it.
                pass
            elif 'mailto' in value[:6].lower():
                # got a mailto...not adding it
                pass
            else:
                link = uummuuObjects.makeURL(self.url, value)
        elif name == 'style':
            this_style = value
            this_has_style = True

    # Only real links are recorded; an empty link carries no information.
    if link != '':
        if no_follow:
            self.no_follow_links.append(link)
        else:
            self.hyperlinks.append(link)

    if this_id == '':
        # Give anonymous anchors a unique synthetic id.
        this_id = "#UummuuUnnamed" + str(self.unnamed_count)
        self.unnamed_count += 1

    # set the current tag
    # NOTE(review): an author supplied id/class is appended without a '#'
    # or '.' prefix, unlike the synthetic id -- confirm styleParser expects this.
    tag_name = 'a' + this_id + this_class
    self.curr_tag = tag_name
    self.whole_tag = self.whole_tag.strip() + ' ' + tag_name

    if this_has_style and this_style.strip():
        # Terminate the declaration list so it forms a valid rule body.
        if not this_style.endswith(';'):
            this_style += ";"
        self.styleParser.add(self.whole_tag + "{" + this_style + "}", 100)
def start_link(self, attributes):
    """Handle a <link> tag, fetching it when it is a screen stylesheet.

    attributes -- iterable of (name, value) pairs for the tag.

    A rel="stylesheet" attribute bumps the stylesheet bookkeeping
    (self.stylesheet / self.num_stylesheet). When the tag is a stylesheet
    and its media is "screen" (also the default when no media attribute is
    given), a MyThread is appended to self.threads, pointed at the
    stylesheet url, run, and its result handed to
    self.styleParser.add(..., 50). self.in_style is False on return.
    """
    href = ''
    media_type = "screen"
    self.in_style = False

    for attr_name, attr_value in attributes:
        key = attr_name.lower()
        if key == 'rel' and attr_value.lower() == 'stylesheet':
            if not self.stylesheet:
                # first stylesheet seen on this page
                self.stylesheet = True
                self.num_stylesheet = 1
            else:
                self.num_stylesheet += 1
            self.in_style = True
        elif key == 'href':
            href = attr_value
        elif key == 'media':
            media_type = attr_value.lower()

    wants_fetch = self.in_style and media_type == 'screen'
    if wants_fetch:
        # create a thread
        self.threads.append(MyThread())
        # assign the thread a .css url to crawl
        css_url = uummuuObjects.makeURL(self.main_url, href)
        worker = self.threads[self.curr_thread]
        worker.url = css_url
        # crawl the css url
        self.styleParser.add(worker.run(), 50)
        # increment the thread counter
        self.curr_thread += 1

    self.in_style = False
def getHTML(self, site):
    """Fetch *site* and store the response on this object.

    site -- the url to request.

    Returns True when the page was read, False when urllib2 raised
    URLError. Other exceptions propagate to the caller.

    Side effects:
      * socket.setdefaulttimeout(10) is applied process wide.
      * self.url, self.content_type and self.redirected_url are reset for
        this request; self.res holds the open response on success.
      * self.Found / self.page hold the outcome (False / '' on URLError).
    """
    # Reset the per-request state up front so a failed fetch can never
    # leave values from a previously fetched page behind.
    self.url = site
    self.content_type = ''
    self.redirected_url = ''
    try:
        # set the socket timeout so we don't get hung up looking for one page.
        socket.setdefaulttimeout(10)
        request = urllib2.Request(site)
        # set up the user agent so the server knows who's asking for this
        # page and how to contact us.
        request.add_header("User-agent", "UUMMUU crawler (Mozilla/5.0 compatible; http://uummuu.com/about/UUMMUUCrawl)")
        self.res = urllib2.urlopen(request)
        self.Found = True

        # Both headers are optional and independent of each other: a missing
        # content-type must not stop the content-location from being read.
        headers = self.res.headers
        content_type = headers.get('content-type')
        if content_type:
            # keep only the media type, dropping parameters such as charset
            self.content_type = content_type.split(';')[0]
        redirect_url = headers.get('content-location')
        if redirect_url:
            try:
                # NOTE(review): makeURL is called unqualified here -- confirm
                # it is in scope in this module; a failure is ignored below.
                self.redirected_url = makeURL(self.res.url, redirect_url)
            except Exception:
                # Best effort only: the redirect location is optional and
                # must never prevent the page itself from being read.
                pass

        self.page = self.res.read()
        return True
    except urllib2.URLError:
        self.Found = False
        self.page = ''
        return False