Esempio n. 1
0
    def work(self):
        """
        Reserve one job from the queue and crawl the URL it refers to.

        The response fetched for the URL is turned into a document, existing
        documents are purged via ``purge_docs_for_url`` and the new document
        is inserted. When the document reports ``can_index``, its URLs,
        excerpts and images are discovered as well. Finally the job is
        rescheduled to run again ``self.repeat_delta`` from now.

        :inheritdoc:

        :return: ``False`` when no job could be reserved, or when the job's
            URL cannot be found or may not be crawled (the job is cleared in
            that case); ``None`` after a completed crawl.
        :raises Exception: any error raised while handling a reserved job is
            re-raised after the job has been released back on to the queue.
        """
        # Reserve outside the try block: when reserving itself fails there is
        # no job to release, and the handler below must never touch an
        # unbound ``job`` (that would mask the real error).
        job = self.jobs.reserve_job(self.queue)
        if job is False:
            return False

        try:
            url = Url.find(job.payload['url_id'])

            if not url or not can_crawl_url(url):
                self.jobs.clear_job(job)
                return False

            response = self.fetch(url)

            doc = Document.from_response(response, url)
            doc.purge_docs_for_url(url)
            doc.insert()

            if doc.can_index:
                doc.discover_urls()
                doc.discover_excerpts()
                doc.discover_images()

            # Schedule the job to be repeated after some period of time
            recrawl_at = datetime.now() + self.repeat_delta
            self.jobs.reschedule_job(job, recrawl_at)
        except Exception:
            # Release the job back on to the queue if an error occurs
            self.jobs.release_job(job)
            print("Releasing job %d because an exception occurred" % job.id)
            # Bare raise keeps the original traceback intact
            raise
Esempio n. 2
0
    def work(self):
        """
        Reserve one job from the queue and crawl the URL it refers to.

        The response fetched for the URL is turned into a document, existing
        documents are purged via ``purge_docs_for_url`` and the new document
        is inserted. When the document reports ``can_index``, its URLs,
        excerpts and images are discovered as well. Finally the job is
        rescheduled to run again ``self.repeat_delta`` from now.

        :inheritdoc:

        :return: ``False`` when no job could be reserved, or when the job's
            URL cannot be found or may not be crawled (the job is cleared in
            that case); ``None`` after a completed crawl.
        :raises Exception: any error raised while handling a reserved job is
            re-raised after the job has been released back on to the queue.
        """
        # Reserve outside the try block: when reserving itself fails there is
        # no job to release, and the handler below must never touch an
        # unbound ``job`` (that would mask the real error).
        job = self.jobs.reserve_job(self.queue)
        if job is False:
            return False

        try:
            url = Url.find(job.payload['url_id'])

            if not url or not can_crawl_url(url):
                self.jobs.clear_job(job)
                return False

            response = self.fetch(url)

            doc = Document.from_response(response, url)
            doc.purge_docs_for_url(url)
            doc.insert()

            if doc.can_index:
                doc.discover_urls()
                doc.discover_excerpts()
                doc.discover_images()

            # Schedule the job to be repeated after some period of time
            recrawl_at = datetime.now() + self.repeat_delta
            self.jobs.reschedule_job(job, recrawl_at)
        except Exception:
            # Release the job back on to the queue if an error occurs
            self.jobs.release_job(job)
            print("Releasing job %d because an exception occurred" % job.id)
            # Bare raise keeps the original traceback intact
            raise
Esempio n. 3
0
    def discover_urls(self):
        """
        Discover URL's in the document and save them in the database.

        Every ``<a>`` element in ``self.soup`` that carries an ``href`` is
        turned into a ``Url`` with ``self.url`` as its base. Links whose
        domain is empty or is contained in ``get_allowed_domains()`` are
        handed to ``Url.insert_bare`` using a single cursor, and the work is
        committed once after all links have been processed.

        The cursor is always closed, also when an insert or the commit fails;
        in that case nothing is committed and the error propagates.
        """
        allowed_domains = get_allowed_domains()

        insert_count = 0
        cursor = self.db.cursor()
        try:
            for link in self.soup.find_all('a'):
                href = link.get('href')
                if href is None:
                    # Anchors without an href (e.g. named anchors) carry no
                    # link target, so there is nothing to insert for them.
                    continue

                url = Url(url=href, base=self.url)
                domain = url.domain()
                if domain == '' or domain in allowed_domains:
                    url.insert_bare(cursor)
                    insert_count += 1

            if insert_count > 0:
                print("Discovered %d new URLs" % insert_count)

            self.db.commit()
        finally:
            cursor.close()
Esempio n. 4
0
    def discover_urls(self):
        """
        Discover URL's in the document and save them in the database.

        Every ``<a>`` element in ``self.soup`` that carries an ``href`` is
        turned into a ``Url`` with ``self.url`` as its base. Links whose
        domain is empty or is contained in ``get_allowed_domains()`` are
        handed to ``Url.insert_bare`` using a single cursor, and the work is
        committed once after all links have been processed.

        The cursor is always closed, also when an insert or the commit fails;
        in that case nothing is committed and the error propagates.
        """
        allowed_domains = get_allowed_domains()

        insert_count = 0
        cursor = self.db.cursor()
        try:
            for link in self.soup.find_all('a'):
                href = link.get('href')
                if href is None:
                    # Anchors without an href (e.g. named anchors) carry no
                    # link target, so there is nothing to insert for them.
                    continue

                url = Url(url=href, base=self.url)
                domain = url.domain()
                if domain == '' or domain in allowed_domains:
                    url.insert_bare(cursor)
                    insert_count += 1

            if insert_count > 0:
                print("Discovered %d new URLs" % insert_count)

            self.db.commit()
        finally:
            cursor.close()
Esempio n. 5
0
def test_default_url_parsing():
    """An absolute URL built without a base is returned as-is by ``geturl``."""
    expected = 'https://syntaxleiden.nl/foo'
    parsed = Url(url=expected)
    assert_equal(parsed.geturl(), expected)
Esempio n. 6
0
def test_relative_url_parsing_with_scheme():
    """A relative path combined with a base that has a scheme gives a full URL."""
    resolved = Url(url='/foo', base='https://syntaxleiden.nl').geturl()
    assert_equal(resolved, 'https://syntaxleiden.nl/foo')
Esempio n. 7
0
def test_relative_url_parsing_with_scheme():
    """A relative path combined with a base that has a scheme gives a full URL."""
    resolved = Url(url='/foo', base='https://syntaxleiden.nl').geturl()
    assert_equal(resolved, 'https://syntaxleiden.nl/foo')
Esempio n. 8
0
def test_default_url_parsing():
    """An absolute URL built without a base is returned as-is by ``geturl``."""
    expected = 'https://syntaxleiden.nl/foo'
    parsed = Url(url=expected)
    assert_equal(parsed.geturl(), expected)
Esempio n. 9
0
def add_url():
    """
    Render the ``add_url.html`` template, inserting the submitted URL on POST.

    For a POST request a ``Url`` is built from the ``url`` form field and the
    result of its ``insert()`` call is passed to the template as ``added``
    together with the ``Url`` itself. For any other request method the
    template is rendered with ``added=None``.
    """
    if request.method != 'POST':
        return render_template('add_url.html', added=None)

    submitted = Url(url=request.form['url'])
    was_added = submitted.insert()
    return render_template('add_url.html', added=was_added, url=submitted)