Example #1
    def configure(self):
        """ Configure this class """

        try:
            self.baseurl = urlparser.HarvestManUrl(self.configobj.url,
                                                   urltypes.URL_TYPE_ANY, 0,
                                                   self.configobj.url,
                                                   self.configobj.projdir)

            # Put the original hash of the start url in the class
            urlparser.HarvestManUrl.hashes[self.baseurl.index] = 1
            # Reset index to zero
            self.baseurl.index = 0
            objects.datamgr.add_url(self.baseurl)

        except urlparser.HarvestManUrlError:
            return False

        self.baseurl.starturl = True

        #if self.configobj.fastmode:
        try:
            self.basetracker = crawler.HarvestManUrlFetcher(
                0, self.baseurl, True)
        except Exception, e:
            print "Fatal Error:", e
            hexit(1)
Example #2
    def grab_url(self, url, filename=None):
        """ Download the given URL and save it to the (optional) filename """

        # If a filename is given, set outfile to it
        if filename:
            objects.config.hgetoutfile = filename
            # print 'Saving to',filename

        # We need to reset some counters and
        # data structures ...

        # Reset progress object
        objects.config.reset_progress()
        # Reset thread pool, multipart status
        self._pool.reset_multipart_data()
        # Reset monitor
        self._monitor.reset()
        # Reset mirror manager
        mirrormgr = mirrors.HarvestManMirrorManager.getInstance()
        mirrormgr.reset()

        try:
            # print objects.config.requests, objects.config.connections
            conn = connector.HarvestManUrlConnector()
            urlobj = None

            try:
                print '\nDownloading URL', url, '...'
                urlobj = urlparser.HarvestManUrl(url)
                ret = conn.url_to_file(urlobj)

                if urlobj.trymultipart and mirrormgr.used:
                    # Print stats if mirrors were used...
                    mirrormgr.print_stats()

                return HGET_DOWNLOAD_OK
            except urlparser.HarvestManUrlError, e:
                print str(e)
                print 'Error: Invalid URL "%s"' % url

                return HGET_DOWNLOAD_ERROR

        except KeyboardInterrupt, e:
            print 'Caught keyboard interrupt...'
            if urlobj: self.clean_up(conn, urlobj)

            return HGET_KEYBOARD_INTERRUPT
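A minimal usage sketch for grab_url, assuming the method lives on hget's downloader class (the class name Hget below is a placeholder, not confirmed by the code above) and that the HGET_* status codes shown above are module-level constants:

# Hypothetical usage sketch - 'Hget' is an assumed class name; only
# grab_url and the HGET_* return codes appear in the code above.
downloader = Hget()
status = downloader.grab_url('http://www.example.com/files/pkg.tar.gz',
                             filename='pkg.tar.gz')

if status == HGET_DOWNLOAD_OK:
    print 'Download complete.'
elif status == HGET_DOWNLOAD_ERROR:
    print 'Invalid URL or download error.'
elif status == HGET_KEYBOARD_INTERRUPT:
    print 'Download interrupted by user.'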
Example #3
    def __init__(self, url, absolute=False):
        self.url = url
        self.absolute = absolute
        # Url object
        self.urlobj = urlparser.HarvestManUrl(self.url)
        # By default, mirror URLs are assumed to be directory URLs.
        # If this is an absolute file URL, leave it untouched.
        if not absolute:
            self.urlobj.set_directory_url()

        # Reliability factor - FUTURE
        self.reliability = 1.0
        # Geo location - FUTURE
        self.geoloc = 0
        # Number of times this mirror was used
        self.usecnt = 0
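A brief construction sketch, assuming this __init__ belongs to a mirror class (named HarvestManMirror below purely for illustration): by default the mirror URL is treated as a directory against which file paths are resolved, while absolute=True marks it as a direct file URL that is used as-is.

# Illustrative only - the class name and URLs are assumptions.
# Directory-style mirror: file paths get resolved relative to this URL.
m1 = HarvestManMirror('http://mirror.example.org/pub/downloads/')
# Absolute mirror: points directly at a single file and is left untouched.
m2 = HarvestManMirror('http://mirror.example.org/pub/downloads/pkg-1.0.tar.gz',
                      absolute=True)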
Example #4
    def calculate_bandwidth(self):
        """ Calculate the user's bandwidth by downloading a specific URL and
        timing it; the result is used to set a limit on the maximum file size """

        # Calculate bandwidth
        bw = 0
        # Look for harvestman.conf in user conf dir
        conf = os.path.join(objects.config.userconfdir, 'harvestman.conf')
        if not os.path.isfile(conf):
            conn = connector.HarvestManUrlConnector()
            urlobj = urlparser.HarvestManUrl(
                'http://harvestmanontheweb.com/schemas/HarvestMan.xsd')
            bw = conn.calc_bandwidth(urlobj)
            bwstr = 'bandwidth=%f\n' % bw
            if bw:
                try:
                    # Cache the measured bandwidth in the user's config file
                    f = open(conf, 'w')
                    f.write(bwstr)
                    f.close()
                except IOError:
                    pass
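The method above caches the measured value as a "bandwidth=<value>" line in harvestman.conf. A hedged sketch of how such a cached value could be read back (the helper below is an illustration, not part of HarvestMan):

import os

def read_cached_bandwidth(conf):
    """ Return the bandwidth cached in harvestman.conf, or 0 if not found
    (illustrative helper, not part of HarvestMan) """

    if not os.path.isfile(conf):
        return 0
    try:
        for line in open(conf):
            line = line.strip()
            if line.startswith('bandwidth='):
                return float(line.split('=', 1)[1])
    except (IOError, ValueError):
        pass
    return 0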
Example #5
    def mirror_url(self, urlobj):
        """ Return mirror URL for the given URL """

        if not self.absolute:
            relpath = self.calc_relative_path(urlobj)
            newurlobj = urlparser.HarvestManUrl(relpath, baseurl=self.urlobj)
        else:
            newurlobj = self.urlobj

        # Set mirror_url attribute
        newurlobj.mirror_url = urlobj
        # Set another attribute indicating that this URL comes from a mirror
        newurlobj.mirrored = True
        newurlobj.trymultipart = True

        self.usecnt += 1
        # print '\t=>',newurlobj.get_full_url()
        # logconsole("Mirror URL %d=> %s" % (x+1, newurlobj.get_full_url()))
        return newurlobj
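A short usage sketch, assuming the mirror class from Example #3 and that calc_relative_path maps the original URL's path onto the mirror's base directory (the class name and URLs are made up):

# Illustrative only - class name and URLs are assumptions.
mirror = HarvestManMirror('http://mirror.example.org/pub/')
orig = urlparser.HarvestManUrl('http://primary.example.com/pub/pkg-1.0.tar.gz')

m_urlobj = mirror.mirror_url(orig)
# The returned object remembers the original URL and is flagged both as
# mirrored and as a candidate for multipart download.
print m_urlobj.mirror_url is orig, m_urlobj.mirrored, m_urlobj.trymultipart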
Example #6
    def process_url(self):
        """ This function downloads the data for a url and writes its files.
        It also posts the data for web pages to a data queue """

        data = ''
        # Raise "beforefetch" event...
        if objects.eventmgr.raise_event('beforefetch', self.url) == False:
            return

        if self.url.qstatus == urlparser.URL_NOT_QUEUED:
            info('Downloading', self.url.get_full_url())
            # About to fetch
            self._fetchtime = time.time()
            self.stateobj.set(self, FETCHER_DOWNLOADING)
            data = objects.datamgr.download_url(self, self.url)

        # Add webpage links in datamgr, if we managed to
        # download the url
        url_obj = self.url

        # print self.url,'=>',self.url.is_webpage()
        if self.url.is_webpage() and data:
            # Create a HarvestMan document with all data we have...

            # Create a document and keep updating it - this is useful to provide
            # information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforeparse" event...
            if objects.eventmgr.raise_event('beforeparse', self.url,
                                            document) == False:
                return

            # Compute a hash of the page data, used to check whether
            # this page was already crawled
            sh = sha.new(data)
            # Set this hash on the URL object itself
            self.url.pagehash = str(sh.hexdigest())

            extrainfo("Parsing web page", self.url)

            self.stateobj.set(self, FETCHER_PARSING)

            links = []

            # Perform any Javascript based redirection etc
            if self._configobj.javascript:
                skipjsparse = False
                # Raise "beforejsparse" event...
                if objects.eventmgr.raise_event('beforejsparse', self.url,
                                                document) == False:
                    # Don't return, skip this...
                    skipjsparse = True

                if not skipjsparse:
                    try:
                        parser = JSParser()
                        parser.parse(data)
                        if parser.locnchanged:
                            redirect_url = parser.getLocation().href
                            extrainfo("Javascript redirection to",
                                      redirect_url)
                            links.append(
                                (urlparser.URL_TYPE_ANY, redirect_url))

                        # DOM modification parsing logic is rudimentary and will
                        # screw up original page data most of the time!

                        #elif parser.domchanged:
                        #    extrainfo("Javascript modified page DOM, using modified data to construct URLs...")
                        #    # Get new content
                        #    datatemp = repr(parser.getDocument())
                        #    # Somehow if data is NULL, don't use it
                        #    if len(datatemp) !=0:
                        #        data = datatemp
                        #    # print data
                    except JSParserException, e:
                        # No point printing this as error, since the parser is very baaaasic!
                        # debug("Javascript parsing error =>", e)
                        pass

                    # Raise "afterjsparse" event
                    objects.eventmgr.raise_event('afterjsparse',
                                                 self.url,
                                                 document,
                                                 links=links)

            parsecount = 0

            while True:
                try:
                    parsecount += 1

                    self.wp.reset()
                    self.wp.set_url(self.url)
                    self.wp.feed(data)
                    # Bug Fix: If the <base href="..."> tag was defined in the
                    # web page, relative urls must be constructed against
                    # the url provided in <base href="...">

                    if self.wp.base_url_defined():
                        url = self.wp.get_base_url()
                        if not self.url.is_equal(url):
                            debug("Base url defined, replacing", self.url)
                            # Construct a url object
                            url_obj = urlparser.HarvestManUrl(
                                url, URL_TYPE_BASE, 0, self.url,
                                self._configobj.projdir)

                            # Change document
                            objects.datamgr.add_url(url_obj)
                            document.set_url(url_obj)

                    self.wp.close()
                    # Related to issue #25 - Print a message if parsing went through
                    # in a 2nd attempt
                    if parsecount > 1:
                        extrainfo(
                            'Parsed web page successfully in second attempt',
                            self.url)
                    break
                except (SGMLParseError, IOError), e:
                    error('SGML parse error:', str(e))
                    error('Error in parsing web-page %s' % self.url)

                    if self.wp.typ == 0:
                        # Parse error occurred with Python parser
                        debug(
                            'Trying to reparse using the HarvestManSGMLOpParser...'
                        )
                        self.make_html_parser(choice=1)
                    else:
                        break
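The parse loop above retries once with an alternative parser when the default SGML parser fails. A minimal, generic sketch of that retry-with-fallback pattern (the parser names below are placeholders, not HarvestMan APIs):

def parse_with_fallback(data, parser_factories):
    """ Try each parser in turn and return the first one that succeeds
    (illustrative sketch; parser_factories are zero-argument callables) """

    last_error = None
    for make_parser in parser_factories:
        parser = make_parser()
        try:
            parser.feed(data)
            parser.close()
            return parser
        except Exception, e:
            last_error = e
    raise last_error

# Usage (placeholder parser classes):
#   page = parse_with_fallback(html_data,
#                              [PrimaryHTMLParser, FallbackHTMLParser])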
Example #7
class HarvestManUrlFetcher(HarvestManBaseUrlCrawler):
    """ This is the fetcher class, which downloads data for a url
    and writes its files. It also posts the data for web pages
    to a data queue """
    def __init__(self, index, url_obj=None, isThread=True):
        HarvestManBaseUrlCrawler.__init__(self, index, url_obj, isThread)
        self._fetchtime = 0
        self.stateobj.set(self, THREAD_IDLE)

    def _initialize(self):
        HarvestManBaseUrlCrawler._initialize(self)
        self._role = "fetcher"
        self.make_html_parser()

    def make_html_parser(self, choice=0):

        if choice == 0:
            self.wp = pageparser.HarvestManSimpleParser()
        elif choice == 1:
            try:
                self.wp = pageparser.HarvestManSGMLOpParser()
            except ImportError:
                self.wp = pageparser.HarvestManSimpleParser()

        # Enable/disable features
        if self.wp is not None:
            for feat, val in self._configobj.htmlfeatures:
                # print feat, '=>', val
                if val: self.wp.enable_feature(feat)
                else: self.wp.disable_feature(feat)

    def get_fetch_timestamp(self):
        """ Return the time stamp before fetching """

        return self._fetchtime

    def set_url_object(self, obj):

        if not obj: return False

        try:
            prior, url_obj = obj
            # url_obj = GetUrlObject(indx)
        except TypeError:
            url_obj = obj

        return HarvestManBaseUrlCrawler.set_url_object(self, url_obj)
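    # Note: set_url_object accepts either a bare url object or a
    # (priority, url_object) pair; both set_url_object(url_obj) and
    # set_url_object((0, url_obj)) resolve to the same underlying call.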

    def action(self):

        if self._isThread:

            if not self.resuming:
                self._loops = 0

            while not self._endflag:

                if not self.resuming:
                    if self.buffer and self._pushflag:
                        debug('Trying to push buffer...')
                        self.push_buffer()

                    self.stateobj.set(self, FETCHER_WAITING)
                    obj = objects.queuemgr.get_url_data("fetcher")

                    if not obj:
                        if self._endflag: break

                        if self.buffer and self._pushflag:
                            debug('Trying to push buffer...')
                            self.push_buffer()

                        continue

                    if not self.set_url_object(obj):
                        debug('NULL URLOBJECT', self)
                        if self._endflag: break
                        continue

                # Process to generate new objects
                # only after trying to push buffer
                # objects.
                self.process_url()

                # Raise "afterfetch" event
                objects.eventmgr.raise_event('afterfetch', self.url)

                self._loops += 1

                # Sleep for some random time
                self.sleep()

                # Set resuming flag to False
                self.resuming = False
        else:
            self.process_url()
            self.crawl_url()

    def offset_links(self, links):
        """ Calculate a new list by applying any offset params
        on the list of links """

        n = len(links)
        # Check for any links offset params - if so trim
        # the list of links to the supplied offset values
        offset_start = self._configobj.linksoffsetstart
        offset_end = self._configobj.linksoffsetend
        # Check for negative values for end offset
        # This is considered as follows.
        # -1 => Till and including end of list
        # -2 => Till and including (n-1) element
        # -3 => Till and including (n-2) element
        # like that... up to -(n-1)...
        if offset_end < 0:
            offset_end = n + offset_end + 1
        # If we still get negative value for offset end
        # discard it and use list till end
        if offset_end < 0:
            offset_end = n

        # Start offset should not have negative values
        if offset_start >= 0:
            return links[offset_start:offset_end]
        else:
            return links[:offset_end]
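    # Worked illustration of the offset semantics: with links = [a, b, c, d]
    # (n = 4),
    #   offsetstart=1,  offsetend=-1  ->  links[1:4]  ->  [b, c, d]
    #   offsetstart=0,  offsetend=-2  ->  links[0:3]  ->  [a, b, c]
    #   offsetstart=-1, offsetend=2   ->  links[:2]   ->  [a, b]  (negative start ignored)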

    def process_url(self):
        """ This function downloads the data for a url and writes its files.
        It also posts the data for web pages to a data queue """

        data = ''
        # Raise "beforefetch" event...
        if objects.eventmgr.raise_event('beforefetch', self.url) == False:
            return

        if self.url.qstatus == urlparser.URL_NOT_QUEUED:
            info('Downloading', self.url.get_full_url())
            # About to fetch
            self._fetchtime = time.time()
            self.stateobj.set(self, FETCHER_DOWNLOADING)
            data = objects.datamgr.download_url(self, self.url)

        # Add webpage links in datamgr, if we managed to
        # download the url
        url_obj = self.url

        # print self.url,'=>',self.url.is_webpage()
        if self.url.is_webpage() and data:
            # Create a HarvestMan document with all data we have...

            # Create a document and keep updating it - this is useful to provide
            # information to events...
            document = url_obj.make_document(data, [], '', [])

            # Raise "beforeparse" event...
            if objects.eventmgr.raise_event('beforeparse', self.url,
                                            document) == False:
                return

            # Compute a hash of the page data, used to check whether
            # this page was already crawled
            sh = sha.new(data)
            # Set this hash on the URL object itself
            self.url.pagehash = str(sh.hexdigest())

            extrainfo("Parsing web page", self.url)

            self.stateobj.set(self, FETCHER_PARSING)

            links = []

            # Perform any Javascript based redirection etc
            if self._configobj.javascript:
                skipjsparse = False
                # Raise "beforejsparse" event...
                if objects.eventmgr.raise_event('beforejsparse', self.url,
                                                document) == False:
                    # Don't return, skip this...
                    skipjsparse = True

                if not skipjsparse:
                    try:
                        parser = JSParser()
                        parser.parse(data)
                        if parser.locnchanged:
                            redirect_url = parser.getLocation().href
                            extrainfo("Javascript redirection to",
                                      redirect_url)
                            links.append(
                                (urlparser.URL_TYPE_ANY, redirect_url))

                        # DOM modification parsing logic is rudimentary and will
                        # screw up original page data most of the time!

                        #elif parser.domchanged:
                        #    extrainfo("Javascript modified page DOM, using modified data to construct URLs...")
                        #    # Get new content
                        #    datatemp = repr(parser.getDocument())
                        #    # Somehow if data is NULL, don't use it
                        #    if len(datatemp) !=0:
                        #        data = datatemp
                        #    # print data
                    except JSParserException, e:
                        # No point printing this as error, since the parser is very baaaasic!
                        # debug("Javascript parsing error =>", e)
                        pass

                    # Raise "afterjsparse" event
                    objects.eventmgr.raise_event('afterjsparse',
                                                 self.url,
                                                 document,
                                                 links=links)

            parsecount = 0

            while True:
                try:
                    parsecount += 1

                    self.wp.reset()
                    self.wp.set_url(self.url)
                    self.wp.feed(data)
                    # Bug Fix: If the <base href="..."> tag was defined in the
                    # web page, relative urls must be constructed against
                    # the url provided in <base href="...">

                    if self.wp.base_url_defined():
                        url = self.wp.get_base_url()
                        if not self.url.is_equal(url):
                            debug("Base url defined, replacing", self.url)
                            # Construct a url object
                            url_obj = urlparser.HarvestManUrl(
                                url, URL_TYPE_BASE, 0, self.url,
                                self._configobj.projdir)

                            # Change document
                            objects.datamgr.add_url(url_obj)
                            document.set_url(url_obj)

                    self.wp.close()
                    # Related to issue #25 - Print a message if parsing went through
                    # in a 2nd attempt
                    if parsecount > 1:
                        extrainfo(
                            'Parsed web page successfully in second attempt',
                            self.url)
                    break
                except (SGMLParseError, IOError), e:
                    error('SGML parse error:', str(e))
                    error('Error in parsing web-page %s' % self.url)

                    if self.wp.typ == 0:
                        # Parse error occurred with Python parser
                        debug(
                            'Trying to reparse using the HarvestManSGMLOpParser...'
                        )
                        self.make_html_parser(choice=1)
                    else:
                        break
                #except ValueError, e:
                #    break
                #except Exception, e:
                #
                #    break

            if self._configobj.robots:
                # Check for NOFOLLOW tag
                if not self.wp.can_follow:
                    extrainfo(
                        'URL %s defines META Robots NOFOLLOW flag, not following its children...'
                        % self.url)
                    return data

            links.extend(self.wp.links)
            # print 'LINKS=>',self.wp.links
            #for typ, link in links:
            #    print 'Link=>',link

            # Let us update some stuff on the document...
            document.keywords = self.wp.keywords[:]
            document.description = self.wp.description
            document.title = self.wp.title

            # Raise "afterparse" event...
            objects.eventmgr.raise_event('afterparse',
                                         self.url,
                                         document,
                                         links=links)

            # Apply the text filter check here. This filter is applied on content
            # or metadata and is always a crawl filter, i.e. since it operates
            # on content, it cannot be applied before the URL is fetched - only
            # afterwards, on the downloaded content. If the filter matches,
            # the URL's children are not crawled...
            if objects.rulesmgr.apply_text_filter(document, self.url):
                extrainfo('Text filter - filtered', self.url)
                return data

            # Sometimes image links are provided in webpages as regular <a href=".."> links.
            # So in order to filter images fully, we need to check the wp.links list also.
            # Sample site: http://www.sheppeyseacadets.co.uk/gallery_2.htm
            if self._configobj.images:
                links += self.wp.images
            else:
                # Filter any links with image extensions out from links
                links = [(typ, link) for typ, link in links
                         if link[link.rfind('.'):].lower() not in netinfo.image_extns]

            #for typ, link in links:
            #    print 'Link=>',link

            self.wp.reset()

            # Filter like that for video, flash & audio
            if not self._configobj.movies:
                # Filter any links with video extension out from links...
                links = [(typ, link) for typ, link in links
                         if link[link.rfind('.'):].lower() not in netinfo.movie_extns]

            if not self._configobj.flash:
                # Filter any links with flash extension out from links...
                links = [(typ, link) for typ, link in links
                         if link[link.rfind('.'):].lower() not in netinfo.flash_extns]

            if not self._configobj.sounds:
                # Filter any links with audio extension out from links...
                links = [(typ, link) for typ, link in links
                         if link[link.rfind('.'):].lower() not in netinfo.sound_extns]

            if not self._configobj.documents:
                # Filter any links with popular documents extension out from links...
                links = [(typ, link) for typ, link in links
                         if link[link.rfind('.'):].lower() not in netinfo.document_extns]

            links = self.offset_links(links)
            # print "Filtered links",links

            # Create collection object
            coll = HarvestManAutoUrlCollection(url_obj)

            children = []
            for typ, url in links:

                is_cgi, is_php = False, False

                # Not sure of the logical validity of the following 2 lines anymore...!
                # This is old code...
                if url.find('php?') != -1: is_php = True
                if typ == 'form' or is_php: is_cgi = True

                if not url: continue
                # print 'URL=>',url,url_obj.get_full_url()

                try:
                    child_urlobj = urlparser.HarvestManUrl(
                        url, typ, is_cgi, url_obj)

                    # print url, child_urlobj.get_full_url()

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError, e:
                    error('URL Error:', e)
                    continue
Example #8
            # Add these links to the queue
            for url in links:
                if not url: continue

                # There is no type information - so look at the
                # extension of the URL. If ending with .css then
                # add as stylesheet type, else as generic type.

                if url.lower().endswith('.css'):
                    urltyp = URL_TYPE_STYLESHEET
                else:
                    urltyp = URL_TYPE_ANY

                try:
                    child_urlobj = urlparser.HarvestManUrl(
                        url, urltyp, False, self.url)

                    if objects.datamgr.check_exists(child_urlobj):
                        continue
                    else:
                        objects.datamgr.add_url(child_urlobj)
                        coll.addURL(child_urlobj)
                        children.append(child_urlobj)

                except urlparser.HarvestManUrlError:
                    continue

            # Update the document...
            for child in children:
                document.add_child(child)
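The link handling in Examples #7 and #8 repeatedly classifies and filters URLs by their extension using link[link.rfind('.'):]. A small hedged sketch of that idea as a standalone helper - the extension sets below are stand-ins for the netinfo lists used above:

def url_extension(link):
    """ Return the lowercased extension of a link, or '' if it has none
    (illustrative helper mirroring the rfind('.') slicing above) """

    idx = link.rfind('.')
    if idx == -1:
        return ''
    return link[idx:].lower()

# Stand-in extension sets - the real crawler uses netinfo.image_extns etc.
IMAGE_EXTNS = ('.png', '.gif', '.jpg', '.jpeg')
STYLESHEET_EXTNS = ('.css',)

links = ['logo.png', 'style.css', 'index.html', 'page2.html']
# Drop image links, mirroring the branch taken when images are disabled.
links = [l for l in links if url_extension(l) not in IMAGE_EXTNS]
# Assign a URL type by extension, mirroring Example #8.
typed = [('stylesheet' if url_extension(l) in STYLESHEET_EXTNS else 'any', l)
         for l in links]
print typed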
Example #9
    return is_sourceforge_url(urlobj)


def is_sourceforge_url(urlobj):
    """ Is this a download from SourceForge? """

    ret = (urlobj.domain in ('downloads.sourceforge.net', 'prdownloads.sourceforge.net') or \
           urlobj.get_full_domain() in HarvestManMirrorManager.sf_mirror_domains )

    return ret
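# A quick usage sketch (the URL below is only an illustrative SourceForge
# download link):
#
#   u = urlparser.HarvestManUrl(
#       'http://prdownloads.sourceforge.net/harvestman/harvestman-2.0.tar.gz')
#   print 'SourceForge download?', is_sourceforge_url(u)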


if __name__ == "__main__":
    import config
    import logger
    import datamgr

    SetAlias(config.HarvestManStateObject())
    cfg = objects.config
    cfg.verbosity = 5
    SetAlias(logger.HarvestManLogger())
    SetLogSeverity()
    SetAlias(datamgr.HarvestManDataManager())

    search = HarvestManMirrorSearch()
    print search.search(
        urlparser.HarvestManUrl(
            'http://pv-mirror02.mozilla.org/pub/mozilla.org/firefox/releases/2.0.0.11/linux-i686/en-US/firefox-2.0.0.11.tar.gz'
        ))