def configure(self):
    """ Configure this class """

    try:
        self.baseurl = urlparser.HarvestManUrl(self.configobj.url,
                                               urltypes.URL_TYPE_ANY,
                                               0, self.configobj.url,
                                               self.configobj.projdir)

        # Put the original hash of the start url in the class
        urlparser.HarvestManUrl.hashes[self.baseurl.index] = 1
        # Reset index to zero
        self.baseurl.index = 0
        objects.datamgr.add_url(self.baseurl)

    except urlparser.HarvestManUrlError:
        return False

    self.baseurl.starturl = True

    #if self.configobj.fastmode:
    try:
        self.basetracker = crawler.HarvestManUrlFetcher(0, self.baseurl, True)
    except Exception, e:
        print "Fatal Error:", e
        hexit(1)
def grab_url(self, url, filename=None):
    """ Download the given URL and save it to the (optional) filename """

    # If a filename is given, set outfile to it
    if filename:
        objects.config.hgetoutfile = filename
        # print 'Saving to',filename

    # We need to reset some counters and
    # data structures ...

    # Reset progress object
    objects.config.reset_progress()
    # Reset thread pool, multipart status
    self._pool.reset_multipart_data()
    # Reset monitor
    self._monitor.reset()
    # Reset mirror manager
    mirrormgr = mirrors.HarvestManMirrorManager.getInstance()
    mirrormgr.reset()

    try:
        # print objects.config.requests, objects.config.connections
        conn = connector.HarvestManUrlConnector()
        urlobj = None

        try:
            print '\nDownloading URL', url, '...'
            urlobj = urlparser.HarvestManUrl(url)
            ret = conn.url_to_file(urlobj)

            if urlobj.trymultipart and mirrormgr.used:
                # Print stats if mirrors were used...
                mirrormgr.print_stats()

            return HGET_DOWNLOAD_OK
        except urlparser.HarvestManUrlError, e:
            print str(e)
            print 'Error: Invalid URL "%s"' % url
            return HGET_DOWNLOAD_ERROR

    except KeyboardInterrupt, e:
        print 'Caught keyboard interrupt...'
        if urlobj:
            self.clean_up(conn, urlobj)
        return HGET_KEYBOARD_INTERRUPT
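
# A minimal usage sketch of grab_url() above (not part of HarvestMan).
# It assumes 'hget' is an instance of the class that defines grab_url(),
# and that the HGET_* status constants it returns are importable from the
# same module; the function name fetch_with_status is hypothetical.
def fetch_with_status(hget, url, filename=None):
    # Dispatch on the documented return codes of grab_url()
    status = hget.grab_url(url, filename)
    if status == HGET_DOWNLOAD_OK:
        print 'Download finished:', url
    elif status == HGET_DOWNLOAD_ERROR:
        print 'Download failed (invalid URL?):', url
    elif status == HGET_KEYBOARD_INTERRUPT:
        print 'Download interrupted by user:', url
    return status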
def __init__(self, url, absolute=False):
    self.url = url
    self.absolute = absolute
    # Url object
    self.urlobj = urlparser.HarvestManUrl(self.url)
    # By default mirror URLs are assumed to be directory URLs;
    # if this is an absolute file URL, then don't do anything.
    if not absolute:
        self.urlobj.set_directory_url()
    # Reliability factor - FUTURE
    self.reliability = 1.0
    # Geo location - FUTURE
    self.geoloc = 0
    # Count of number of times this mirror was used
    self.usecnt = 0
def calculate_bandwidth(self):
    """ Calculate bandwidth of the user by downloading a specific URL
    and timing it, setting a limit on maximum file size """

    # Calculate bandwidth
    bw = 0
    # Look for harvestman.conf in user conf dir
    conf = os.path.join(objects.config.userconfdir, 'harvestman.conf')
    if not os.path.isfile(conf):
        conn = connector.HarvestManUrlConnector()
        urlobj = urlparser.HarvestManUrl('http://harvestmanontheweb.com/schemas/HarvestMan.xsd')
        bw = conn.calc_bandwidth(urlobj)
        bwstr = 'bandwidth=%f\n' % bw
        if bw:
            try:
                open(conf, 'w').write(bwstr)
            except IOError, e:
                pass
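
# Usage sketch (illustration only): calculate_bandwidth() above caches the
# measured value as a single 'bandwidth=<float>' line in harvestman.conf.
# A helper like the hypothetical read_cached_bandwidth() below could read
# the cached value back, so the probe download is skipped on later runs.
def read_cached_bandwidth(conf):
    """ Return the cached bandwidth value from conf, or 0 if unavailable """
    try:
        for line in open(conf):
            if line.startswith('bandwidth='):
                return float(line.split('=', 1)[1])
    except (IOError, ValueError):
        pass
    return 0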
def mirror_url(self, urlobj):
    """ Return mirror URL for the given URL """

    if not self.absolute:
        relpath = self.calc_relative_path(urlobj)
        newurlobj = urlparser.HarvestManUrl(relpath, baseurl=self.urlobj)
    else:
        newurlobj = self.urlobj

    # Set mirror_url attribute
    newurlobj.mirror_url = urlobj
    # Set another attribute indicating the mirror is different
    newurlobj.mirrored = True
    newurlobj.trymultipart = True

    self.usecnt += 1
    # print '\t=>',newurlobj.get_full_url()
    # logconsole("Mirror URL %d=> %s" % (x+1, newurlobj.get_full_url()))

    return newurlobj
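
# Usage sketch (illustration only) of the mirror flow defined by __init__()
# and mirror_url() above: a mirror built from a directory URL maps a file URL
# to the corresponding path on the mirror and flags it for multipart download.
# 'make_mirror' stands in for constructing the mirror class shown above (its
# name is not part of this excerpt), and the example URLs are made up.
def demo_mirror_flow(make_mirror):
    mirror = make_mirror('http://mirror.example.org/pub/')   # directory mirror URL
    fileurl = urlparser.HarvestManUrl('http://primary.example.org/pub/files/pkg-1.0.tar.gz')
    m_url = mirror.mirror_url(fileurl)
    # m_url keeps a back-reference to the original URL in m_url.mirror_url,
    # and has its mirrored/trymultipart attributes set to True.
    print m_url.get_full_url()
    return m_url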
class HarvestManUrlFetcher(HarvestManBaseUrlCrawler):
    """ This is the fetcher class, which downloads data for a url
    and writes its files. It also posts the data for web pages
    to a data queue """

    def __init__(self, index, url_obj=None, isThread=True):
        HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
        self._fetchtime = 0
        self.stateobj.set(self, THREAD_IDLE)

    def _initialize(self):
        HarvestManBaseUrlCrawler._initialize(self)
        self._role = "fetcher"
        self.make_html_parser()

    def make_html_parser(self, choice=0):

        if choice == 0:
            self.wp = pageparser.HarvestManSimpleParser()
        elif choice == 1:
            try:
                self.wp = pageparser.HarvestManSGMLOpParser()
            except ImportError:
                self.wp = pageparser.HarvestManSimpleParser()

        # Enable/disable features
        if self.wp != None:
            for feat, val in self._configobj.htmlfeatures:
                # print feat,'=>',val
                if val:
                    self.wp.enable_feature(feat)
                else:
                    self.wp.disable_feature(feat)

    def get_fetch_timestamp(self):
        """ Return the time stamp before fetching """

        return self._fetchtime

    def set_url_object(self, obj):

        if not obj:
            return False

        try:
            prior, url_obj = obj
            # url_obj = GetUrlObject(indx)
        except TypeError:
            url_obj = obj

        return HarvestManBaseUrlCrawler.set_url_object(self, url_obj)

    def action(self):

        if self._isThread:

            if not self.resuming:
                self._loops = 0

            while not self._endflag:

                if not self.resuming:
                    if self.buffer and self._pushflag:
                        debug('Trying to push buffer...')
                        self.push_buffer()

                    self.stateobj.set(self, FETCHER_WAITING)
                    obj = objects.queuemgr.get_url_data("fetcher")

                    if not obj:
                        if self._endflag:
                            break

                        if self.buffer and self._pushflag:
                            debug('Trying to push buffer...')
                            self.push_buffer()
                        continue

                    if not self.set_url_object(obj):
                        debug('NULL URLOBJECT', self)
                        if self._endflag:
                            break
                        continue

                # Process to generate new objects only after
                # trying to push buffer objects.
                self.process_url()

                # Raise "afterfetch" event
                objects.eventmgr.raise_event('afterfetch', self.url)

                self._loops += 1

                # Sleep for some random time
                self.sleep()

                # Set resuming flag to False
                self.resuming = False
        else:
            self.process_url()
            self.crawl_url()

    def offset_links(self, links):
        """ Calculate a new list by applying any offset params on the list of links """

        n = len(links)
        # Check for any links offset params - if so, trim
        # the list of links to the supplied offset values
        offset_start = self._configobj.linksoffsetstart
        offset_end = self._configobj.linksoffsetend

        # Check for negative values for the end offset.
        # This is interpreted as follows:
        # -1 => till and including the end of the list
        # -2 => till and including the (n-1)th element
        # -3 => till and including the (n-2)th element
        # ... and so on, up to -(n-1).
        # (A worked example of these rules follows the class definition.)
        if offset_end < 0:
            offset_end = n + offset_end + 1

        # If we still get a negative value for the end offset,
        # discard it and use the list till the end
        if offset_end < 0:
            offset_end = n

        # Start offset should not have negative values
        if offset_start >= 0:
            return links[offset_start:offset_end]
        else:
            return links[:offset_end]

    def process_url(self):
        """ This function downloads the data for a url and writes its files.
        It also posts the data for web pages to a data queue """

        data = ''

        # Raise "beforefetch" event...
        if objects.eventmgr.raise_event('beforefetch', self.url) == False:
            return

        if self.url.qstatus == urlparser.URL_NOT_QUEUED:
            info('Downloading', self.url.get_full_url())
            # About to fetch
            self._fetchtime = time.time()
            self.stateobj.set(self, FETCHER_DOWNLOADING)
            data = objects.datamgr.download_url(self, self.url)

        # Add webpage links in datamgr, if we managed to
        # download the url
        url_obj = self.url
        # print self.url,'=>',self.url.is_webpage()

        if self.url.is_webpage() and data:
            # Create a HarvestMan document with all data we have...
            # Create a document and keep updating it - this is useful to
            # provide information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforeparse" event...
            if objects.eventmgr.raise_event('beforeparse', self.url, document) == False:
                return

            # Check if this page was already crawled
            url = self.url.get_full_url()
            sh = sha.new(data)
            # Set this hash on the URL object itself
            self.url.pagehash = str(sh.hexdigest())

            extrainfo("Parsing web page", self.url)
            self.stateobj.set(self, FETCHER_PARSING)

            links = []

            # Perform any Javascript based redirection etc
            if self._configobj.javascript:
                skipjsparse = False
                # Raise "beforejsparse" event...
                if objects.eventmgr.raise_event('beforejsparse', self.url, document) == False:
                    # Don't return, skip this...
                    skipjsparse = True

                if not skipjsparse:
                    try:
                        parser = JSParser()
                        parser.parse(data)

                        if parser.locnchanged:
                            redirect_url = parser.getLocation().href
                            extrainfo("Javascript redirection to", redirect_url)
                            links.append((urlparser.URL_TYPE_ANY, redirect_url))

                        # DOM modification parsing logic is rudimentary and will
                        # screw up original page data most of the time!
                        #elif parser.domchanged:
                        #    extrainfo("Javascript modified page DOM, using modified data to construct URLs...")
                        #    # Get new content
                        #    datatemp = repr(parser.getDocument())
                        #    # Somehow if data is NULL, don't use it
                        #    if len(datatemp) != 0:
                        #        data = datatemp
                        #    # print data
                    except JSParserException, e:
                        # No point printing this as error, since the parser is very baaaasic!
                        # debug("Javascript parsing error =>", e)
                        pass

                # Raise "afterjsparse" event
                objects.eventmgr.raise_event('afterjsparse', self.url, document, links=links)

            parsecount = 0
            while True:
                try:
                    parsecount += 1

                    self.wp.reset()
                    self.wp.set_url(self.url)
                    self.wp.feed(data)

                    # Bug Fix: If the <base href="..."> tag was defined in the
                    # web page, relative urls must be constructed against
                    # the url provided in <base href="...">
                    if self.wp.base_url_defined():
                        url = self.wp.get_base_url()
                        if not self.url.is_equal(url):
                            debug("Base url defined, replacing", self.url)
                            # Construct a url object
                            url_obj = urlparser.HarvestManUrl(url,
                                                              URL_TYPE_BASE,
                                                              0, self.url,
                                                              self._configobj.projdir)
                            # Change document
                            objects.datamgr.add_url(url_obj)
                            document.set_url(url_obj)

                    self.wp.close()

                    # Related to issue #25 - Print a message if parsing went
                    # through in a 2nd attempt
                    if parsecount > 1:
                        extrainfo('Parsed web page successfully in second attempt', self.url)
                    break
                except (SGMLParseError, IOError), e:
                    error('SGML parse error:', str(e))
                    error('Error in parsing web-page %s' % self.url)

                    if self.wp.typ == 0:
                        # Parse error occurred with Python parser
                        debug('Trying to reparse using the HarvestManSGMLOpParser...')
                        self.make_html_parser(choice=1)
                    else:
                        break
                #except ValueError, e:
                #    break
                #except Exception, e:
                #    break

            if self._configobj.robots:
                # Check for NOFOLLOW tag
                if not self.wp.can_follow:
                    extrainfo('URL %s defines META Robots NOFOLLOW flag, not following its children...'
                              % self.url)
                    return data

            links.extend(self.wp.links)
            # print 'LINKS=>',self.wp.links
            #for typ, link in links:
            #    print 'Link=>',link

            # Let us update some stuff on the document...
            document.keywords = self.wp.keywords[:]
            document.description = self.wp.description
            document.title = self.wp.title

            # Raise "afterparse" event...
            objects.eventmgr.raise_event('afterparse', self.url, document, links=links)

            # Apply textfilter check here. This filter is applied on content
            # or metadata and is always a crawl filter, i.e. since it operates
            # on content, we cannot apply the filter before the URL is fetched.
            # However, it is applied after the URL is fetched, on its content.
            # If it matches, then its children are not crawled...
            if objects.rulesmgr.apply_text_filter(document, self.url):
                extrainfo('Text filter - filtered', self.url)
                return data

            # Sometimes image links are provided in webpages as regular
            # <a href=".."> links. So in order to filter images fully, we
            # need to check the wp.links list also.
            # Sample site: http://www.sheppeyseacadets.co.uk/gallery_2.htm
            if self._configobj.images:
                links += self.wp.images
            else:
                # Filter any links with image extensions out from links
                links = [(type, link) for type, link in links \
                         if link[link.rfind('.'):].lower() not in netinfo.image_extns]

            #for typ, link in links:
            #    print 'Link=>',link

            self.wp.reset()

            # Filter like that for video, flash & audio
            if not self._configobj.movies:
                # Filter any links with video extensions out from links...
                links = [(type, link) for type, link in links \
                         if link[link.rfind('.'):].lower() not in netinfo.movie_extns]

            if not self._configobj.flash:
                # Filter any links with flash extensions out from links...
                links = [(type, link) for type, link in links \
                         if link[link.rfind('.'):].lower() not in netinfo.flash_extns]

            if not self._configobj.sounds:
                # Filter any links with audio extensions out from links...
                links = [(type, link) for type, link in links \
                         if link[link.rfind('.'):].lower() not in netinfo.sound_extns]

            if not self._configobj.documents:
                # Filter any links with popular document extensions out from links...
                links = [(type, link) for type, link in links \
                         if link[link.rfind('.'):].lower() not in netinfo.document_extns]

            links = self.offset_links(links)
            # print "Filtered links", links

            # Create collection object
            coll = HarvestManAutoUrlCollection(url_obj)

            children = []
            for typ, url in links:

                is_cgi, is_php = False, False

                # Not sure of the logical validity of the following 2 lines anymore...!
                # This is old code...
                if url.find('php?') != -1:
                    is_php = True
                if typ == 'form' or is_php:
                    is_cgi = True

                if not url or len(url) == 0:
                    continue

                # print 'URL=>',url,url_obj.get_full_url()

                try:
                    child_urlobj = urlparser.HarvestManUrl(url,
                                                           typ,
                                                           is_cgi,
                                                           url_obj)
                    # print url, child_urlobj.get_full_url()

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError, e:
                    error('URL Error:', e)
                    continue
            # Add these links to the queue
            for url in links:
                if not url:
                    continue

                # There is no type information - so look at the
                # extension of the URL. If ending with .css then
                # add as stylesheet type, else as generic type.
                if url.lower().endswith('.css'):
                    urltyp = URL_TYPE_STYLESHEET
                else:
                    urltyp = URL_TYPE_ANY

                try:
                    child_urlobj = urlparser.HarvestManUrl(url,
                                                           urltyp,
                                                           False,
                                                           self.url)

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError:
                    continue

            # Update the document...
            for child in children:
                document.add_child(child)
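
# Worked example (illustration only) of the offset slicing rules used by
# offset_links() above, with plain integers standing in for (type, link)
# tuples. The standalone helper below simply repeats the same arithmetic so
# the examples can be tried outside the crawler.
def _offset_demo(links, offset_start, offset_end):
    n = len(links)
    if offset_end < 0:
        offset_end = n + offset_end + 1
    if offset_end < 0:
        offset_end = n
    if offset_start >= 0:
        return links[offset_start:offset_end]
    return links[:offset_end]

# With 5 links: an end offset of -1 keeps everything, -2 drops the last element.
# print _offset_demo(range(5), 0, -1)   # -> [0, 1, 2, 3, 4]
# print _offset_demo(range(5), 1, -2)   # -> [1, 2, 3]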
        return is_sourceforge_url(urlobj)

def is_sourceforge_url(urlobj):
    """ Is this a download from sourceforge ? """

    ret = (urlobj.domain in ('downloads.sourceforge.net', 'prdownloads.sourceforge.net') or \
           urlobj.get_full_domain() in HarvestManMirrorManager.sf_mirror_domains)

    return ret

if __name__ == "__main__":
    import config
    import logger
    import datamgr

    SetAlias(config.HarvestManStateObject())
    cfg = objects.config
    cfg.verbosity = 5

    SetAlias(logger.HarvestManLogger())
    SetLogSeverity()

    SetAlias(datamgr.HarvestManDataManager())

    search = HarvestManMirrorSearch()
    print search.search(urlparser.HarvestManUrl(
        'http://pv-mirror02.mozilla.org/pub/mozilla.org/firefox/releases/2.0.0.11/linux-i686/en-US/firefox-2.0.0.11.tar.gz'))