Example #1
0
 def start_meta(self, attributes):
     #we should usually accept meta information
     #it is becoming less and less useful.
     #Maybe in the future this will be taken out.
     #this is just a place for the web master to tell us about the page.
     #we should usually be able to find this from the page itself.
     in_desc = 0;
     in_title = 0;
     for name, value in attributes:
         if name.lower() == 'name' and value.lower()=="description":
             in_desc = 1;
         if name.lower() == 'name' and value.lower()=='title':
             in_title = 1;
         if name.lower() =='value' and in_desc:
             self.meta['description'] = value;
         if name.lower() =='value' and in_title:
             self.meta['title'] = value;
         
         #if this is a meta refresh it should have http-equiv in its name
         if name.lower() == 'http-equiv' and value.lower() == 'refresh':
             #tell that it has a meta refresh
             self.has_meta_refresh = True;
         #if this also has a content and it contains a url to refresh to
         if name.lower() == 'content' and 'url=' in value.lower():
             start_pos = value.lower().find('url=');
             end_pos = value.lower().find(';', start_pos);
             if(end_pos == -1):
                 self.meta_refresh_url = uummuuObjects.makeURL(self.url, value[start_pos+4:]);
             else:
                 self.meta_refresh_url = uummuuObjects.makeURL(self.url, value[start_pos+4:end_pos]);
Example #2
0
 def start_a(self, attributes):
     #we are in a link
     #there are many different types of links
     #we need to make sure to not follow javascript
     #we also need to not follow location tags
     #these are the tags that have a pound sign then just redirect on this page.
     link = '';
     no_follow = False;
     this_id = '';
     this_class = '';
     this_style = '';
     this_has_style = False;
     #make sure to add data to styled objects
     self.add_to_StyledObjects();
     
     for name, value in attributes:
         if name.lower() == 'id':
             this_id = value;
         if name.lower() == 'class':
             this_class = value;
         if name == 'rel':
             if value.lower() == 'nofollow':
                 no_follow = True;
         if name == "href":
             urlName = self.url;
             self.inside_an_element = 1;
             #remove any anchor references
             if(value.find('#') > 0):
                 value = value[0:value.find("#")];
             if(value[0:4] == "http"):
                 link = value;
             elif (value[0] == '/'):
                 link = uummuuObjects.makeURL(self.domain, value);
             elif (value == ''):
                 #got an anchor reference...not adding it.
                 pass;
             elif ('javascript:' in value[0:15]):
                 #got a javascript... not adding it.
                 pass;
             elif ('mailto' in value[0:6]):
                 #got a mailto...not adding it
                 pass;
             else:
                 link = uummuuObjects.makeURL(self.url, value);
         elif name.lower() == 'style':
             this_style = value;
             this_has_style = True;
     if(no_follow):
         self.no_follow_links.append(link);
     elif link != '':
         self.hyperlinks.append(link);
     if this_id == '':
         this_id = "#UummuuUnnamed"+str(self.unnamed_count);
         self.unnamed_count += 1;
     self.curr_tag = 'a'+this_id+this_class;
     self.whole_tag = self.whole_tag.lstrip().rstrip() + ' ' + 'a'+this_id+this_class; #set the current tag
     if this_has_style:
         if this_style[-1] != ';':
             this_style += ";";
         self.styleParser.add(self.whole_tag+"{"+this_style+"}", 100);
Example #3
0
 def start_link(self, attributes):
     url = '';
     self.in_style = False;
     media = "screen";
     for name, value in attributes:
         if name.lower() == 'rel' and value.lower() == 'stylesheet':
             if self.stylesheet:
                 self.num_stylesheet += 1;
             else:
                 self.stylesheet = True;
                 self.num_stylesheet = 1;
             self.in_style = True;
         elif name.lower() == 'href':
             url = value;
         elif name.lower() == 'media':
             media = value.lower();
     if self.in_style and media == 'screen':
         self.threads.append(MyThread()); #create a thread
         self.threads[self.curr_thread].url = uummuuObjects.makeURL(self.main_url, url); #assign the thread a .css url to crawl
         self.styleParser.add(self.threads[self.curr_thread].run(), 50); #crawl the css url
         self.curr_thread += 1; #increment the thread counter
     self.in_style = False;
Example #4
0
 def getHTML(self, site):
     try:
         #set the socket timeout so we don't get hung up looking for one page.
         socket.setdefaulttimeout(10);
         self.url = site;
         self.content_type = '';
         request = urllib2.Request(site);
         #set up the user agent so the server knows who's asking for this page and how to contact us.
         request.add_header("User-agent",  "UUMMUU crawler (Mozilla/5.0 compatible; http://uummuu.com/about/UUMMUUCrawl)");
         self.res = urllib2.urlopen(request);
         self.Found = True;
         self.redirected_url = '';
         try:
             self.content_type = self.res.headers['content-type'].split(';')[0];
             redirect_url = self.res.headers['content-location'];
             self.redirected_url = makeURL(self.res.url, redirect_url);
         except:
             pass;
         self.page = self.res.read();
         return True;
     except urllib2.URLError, e:
         self.Found = False;
         self.page = '';
         return False;