Example no. 1
def crawl_exe():
    global PROGRESS_TRACKER
    global CRAWLS
    delegate = Delegate()

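    # Listener handed to the crawler below; the current request context is
    # copied so Flask helpers (flash, redirect) remain usable when the
    # crawler invokes it later, outside the original request.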
    @copy_current_request_context
    def notify(msg):
        crawlId = str(msg['crawlId'])

        # FIXME: Naive way to avoid leaks (by not keeping references to finished crawls)
        if msg['status'] == 'done':
            del CRAWLS[int(crawlId)]

        # progress = ProgressTracker._msg_to_progress(msg)
        # pj = jsonpickle.encode(progress)
        PROGRESS_TRACKER.set_progress(crawlId, msg)
        try:
            crawl = delegate.crawl_get_by_id(crawlId)
        except ValueError as ve:
            flash('No crawl id.')
            return redirect(url_for('crawl'))

    if not request.form.get('address'):
        flash('No address.')
        return redirect(url_for('crawl'))

    user = delegate.user_get_by_id(session['user_id'])
    sites = delegate.site_get_all()  # TODO: In the future show only sites for current user

    # Save to DB
    crawl = Crawl(site_id=user.current_site_id)
    delegate.crawl_create(crawl)

    initial_url = request.form['address']
    max_links = 0
    if len(request.form['max']) > 0 and int(request.form['max']) > 0:
        max_links = int(request.form['max'])

    crawler = CrawlerDB(delegate,
                        initial_url,
                        id=crawl.id,
                        no_workers=10,
                        max_links=max_links)
    CRAWLS[crawl.id] = crawler
    crawler.addListener(notify)
    crawler.start()

    return render_template('crawl_progress.html',
                           crawl=crawl,
                           user=user,
                           sites=sites)
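crawl_exe() above also depends on module-level state and Flask plumbing that the snippet does not show. A minimal sketch of that wiring, offered purely as an assumption (the route path, the app object and the ProgressTracker construction are guesses, not part of the original code):

from flask import Flask, request, session, flash, redirect, url_for, \
    render_template, copy_current_request_context

app = Flask(__name__)
app.secret_key = 'change-me'            # session and flash need a secret key

PROGRESS_TRACKER = ProgressTracker()    # assumed: project class referenced in the snippet's comments
CRAWLS = {}                             # assumed: crawl.id -> running CrawlerDB instance

# Assumed registration; the real URL rule and HTTP methods may differ.
app.add_url_rule('/crawl_exe', 'crawl_exe', crawl_exe, methods=['POST'])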
Example no. 2
    def test_crawl(self):
        delegate = XDelegate()

        print("test_crawl started")
        # session = delegate.get_session()

        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        # Create a datetime 2 minutes in the future so crawl2 becomes the most recent crawl
        delta = datetime.timedelta(minutes=2)
        t2 = crawl.date + delta

        crawl2 = Crawl(site_id=site1.id, date=t2)
        delegate.crawl_create(crawl2)
        assert crawl2.id > 0

        sites = delegate.site_get_all()
        print("No of site: {}".format(len(sites)))
        assert len(sites) == 1

        crawls = delegate.crawl_get_all()
        assert len(crawls) == 2

        crawls2 = delegate.crawl_get_all_for_site(site1.id)
        assert len(crawls2) == 2

        last_crawl = delegate.crawl_get_last_for_site(site1.id)
        assert last_crawl.id == crawl2.id, "Last crawl id was {} when it should be {}".format(
            last_crawl.id, crawl2.id)

        # delegate.crawl_delete_all()
        delegate.site_delete_all()
        print("test_crawl done")
Example no. 3
def main():
    # domain = 'localhost:7000'
    domain = 'abctimetracking.com'
    max_links = 0

    # Reset the DB (mock() presumably recreates it with test data)
    from manage_db import mock
    mock()

    # Parse arguments
    parser = argparse.ArgumentParser(description="A simple website crawler.")
    parser.add_argument('-d',
                        '--domain',
                        type=str,
                        default=None,
                        help='Domain to crawl (defaults to %s)' % domain)
    parser.add_argument('-w',
                        '--workers',
                        type=int,
                        default=10,
                        help='Number of workers')
    parser.add_argument('-m',
                        '--max-links',
                        type=int,
                        default=0,
                        help='Maximum no. of links to index')
    parser.add_argument('--delay',
                        type=int,
                        default=0,
                        help='Delay between requests')
    args = parser.parse_args()

    if args.domain:
        domain = args.domain
    else:
        print('No domain passed, using %s.' % domain)
        print(
            'Read usage details in file header for more information on passing arguments.'
        )

    if args.max_links:
        max_links = args.max_links

    theURL = 'http://' + domain
    noOfWorkers = args.workers

    delegate = Delegate()
    site = Site(name=domain, url=theURL)
    delegate.site_create(site)
    crawl = Crawl(site_id=site.id)
    delegate.crawl_create(crawl)

    crawler = CrawlerDB(initialLink=theURL,
                        max_links=max_links,
                        no_workers=noOfWorkers,
                        delegate=delegate,
                        id=crawl.id)
    # crawler = CrawlerDB(max_links=max_links, no_workers=noOfWorkers, delegate=delegate, id=1)

    t1 = time.time()
    crawler.start()
    crawler.join()
    t2 = time.time()
    total_time = t2 - t1

    logger.info("Total internal links visited: %d in: %ds" %
                (crawler.no_visited_urls(), total_time))
    # for url in [link.absolute_url for link in crawler.visited]:
    # 	logger.info("\t" + url)

    logger.info("Total external links: %d" % crawler.no_external_urls())
    # for url in [link.absolute_url for link in crawler.external_links]:
    # 	logger.info("\t" + url)

    # report('./crawl-requests-report.log', crawler.visited)

    # crawler.export()
    print("All done. In limbo")

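    # Drop the crawler reference; the sleep below presumably gives the worker
    # threads a chance to wind down before the process exits.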
    crawler = None

    time.sleep(10)
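main() above likewise assumes a few imports and an entry point that are not part of the snippet. A minimal sketch, with the project-specific module paths left commented out because they are guesses:

import argparse
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Project-specific classes used by main(); actual module paths are assumptions.
# from db import Delegate, Site, Crawl
# from crawler import CrawlerDB

if __name__ == '__main__':
    main()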
Example no. 4
    def test_link(self):
        delegate = XDelegate()

        print("test_page started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        # Page
        page = Resource()
        page.crawl_id = crawl.id
        page.content = "Ala bala portocala"
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)

        # Link

        # Test url_is_present()
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert not p1

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 0, 'n1 is {}'.format(n1)

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 0

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 0

        url1 = Url()
        url1.src_resource_id = page.id
        url1.url = '/team'
        url1.absolute_url = 'https://scriptoid.com/team'
        url1.type = Url.TYPE_INTERNAL
        url1.crawl_id = crawl.id
        url1.job_status = Url.JOB_STATUS_IN_PROGRESS
        lid1 = delegate.url_create(url1)
        assert url1.id > 0
        assert lid1 == url1.id

        url2 = Url()
        url2.src_resource_id = page.id
        url2.dst_resource_id = page.id
        url2.url = '/contact'
        url2.absolute_url = 'https://scriptoid.com/index.php'
        url2.type = Url.TYPE_INTERNAL
        url2.crawl_id = crawl.id
        delegate.url_create(url2)
        assert url2.id > 0

        url3 = Url()
        url3.dst_resource_id = page.id
        url3.url = '/jobs'
        url3.absolute_url = 'https://scriptoid.com/jobs.php'
        url3.type = Url.TYPE_INTERNAL
        url3.crawl_id = crawl.id
        delegate.url_create(url3)
        assert url3.id > 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 1

        # Test url_get_by_id()
        u1 = delegate.url_get_by_id(url1.id)
        assert u1.id == url1.id

        # Test url_is_present()
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert p1

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 3

        # Test first unvisited link
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id, 'l1.id = {} and url2.id = {}'.format(
            l1.id, url2.id)

        # Test url_get_all_unvisited()
        unvisited1 = delegate.url_get_all_unvisited(crawl.id)
        assert len(unvisited1) == 2

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 0, 'Actually n2 is {}'.format(n2)

        url1.job_status = Url.JOB_STATUS_VISITED
        delegate.url_update(url1)
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id

        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 1, 'n2 is {}'.format(n2)

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 1

        # Test url_count_pending()
        ucp = delegate.url_count_pending(crawl.id)
        assert ucp == 2

        # Test url_delete_all()
        delegate.url_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "Expected 0 links but found {}".format(len(links))

        # Test url_count_external()
        uce = delegate.url_count_external(crawl.id)
        assert uce == 0

        url4 = Url()
        url4.dst_resource_id = page.id
        url4.url = '/jobs'
        url4.absolute_url = 'https://scriptoid.com/jobs.php'
        url4.type = Url.TYPE_EXTERNAL
        url4.crawl_id = crawl.id
        delegate.url_create(url4)
        assert url4.id > 0

        uce = delegate.url_count_external(crawl.id)
        assert uce == 1

        assert delegate.url_delete_by_id(url4.id)

        # Test that resource_delete_all() on the parent Page cascades to its Urls (links)
        url = Url()
        url.src_resource_id = page.id
        url.url = '/contact'
        url.absolute_url = 'https://scriptoid.com/index.php'
        url.type = Url.TYPE_INTERNAL
        url.crawl_id = crawl.id
        delegate.url_create(url)
        assert url.id > 0

        delegate.resource_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "Expected 0 links but found {}".format(len(links))

        # Clean up
        # delegate.link_delete_all()
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        print("test_page done")
Example no. 5
    def test_page(self):
        delegate = XDelegate()

        print("test_page started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        no_pages = delegate.resource_count_visited(crawl.id)
        assert no_pages == 0, "No of pages is {}".format(no_pages)

        # Page
        craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
        assert len(craw_resources) == 0

        # test resource_get_by_absolute_url_and_crawl_id()
        r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
            "no such url :p", crawl.id)
        assert r1 is None

        # test resource_is_present()
        present = delegate.resource_is_present('no such url :p', crawl.id)
        assert not present

        page = Resource()
        page.crawl_id = crawl.id
        page.content = "A long content " + "a" * 1024 * 1024
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)
        assert page.id > 0

        # test resource_get_by_id()
        r2 = delegate.resource_get_by_id(page.id)
        assert r2.id == page.id

        # test resource_is_present()
        present = delegate.resource_is_present(page.absolute_url, crawl.id)
        assert present

        pages = delegate.resource_get_all()
        assert len(pages) > 0

        no_pages = delegate.resource_count_visited(crawl.id)
        assert no_pages == 1, "No of pages is {}".format(no_pages)

        craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
        assert len(craw_resources) > 0

        r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
            page.absolute_url, crawl.id)
        assert r1.id == page.id

        # Test cascade delete
        delegate.crawl_delete_all()
        pages = delegate.resource_get_all()
        assert len(pages) == 0, "It should be {} but we found {}".format(
            0, len(pages))

        # Clean up
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        print("test_page done")