Code example #1
File: app.py Project: alexgheorghiu/amalgam
from flask import flash, redirect, render_template, request, session, url_for


def crawl_view_pages():
    delegate = Delegate()  # Delegate is the project's data-access layer
    try:
        crawl_id = request.args.get('id', type=int)
        if crawl_id is None:  # get(..., type=int) returns None instead of raising
            raise ValueError('No crawl id')
        crawl = delegate.crawl_get_by_id(crawl_id)
        # links = delegate.url_get_all_by_crawl_id(crawl_id)
        resources = delegate.resource_get_all_by_crawl(crawl_id)
        user = delegate.user_get_by_id(session['user_id'])
        # TODO: In the future show only sites for the current user
        sites = delegate.site_get_all()
        return render_template('crawl_view_pages.html',
                               crawl=crawl,
                               resources=resources,
                               user=user,
                               sites=sites)
    except ValueError:
        flash('No crawl id.')
        return redirect(url_for('crawl'))
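
For context, a view like this still needs to be registered on the Flask application object; the decorator is not part of the excerpt. A minimal sketch, assuming the application instance is named app (the URL path is hypothetical):

from flask import Flask

app = Flask(__name__)

# Hypothetical registration; the real route path in amalgam may differ
app.add_url_rule('/crawl/pages', view_func=crawl_view_pages)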
Code example #2
def bar_data(crawl_id, bar_no):
    """Returns the pages for a bar (interval) inside a crawl"""
    delegate = Delegate()

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    (lower, upper) = no_2_interval(bar_no)
    selected_pages = []
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        percent = no * 100 / no_total
        # Intervals are half-open [lower, upper), except the last one,
        # which is closed so that pages at exactly 100% are included
        if (lower <= percent < upper) or (upper == 100 and percent == upper):
            selected_pages.append({'page': page, 'no': no})

    return selected_pages
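
no_2_interval is a project helper that is not part of the excerpt. A minimal sketch of a compatible implementation, assuming bars are numbered from 0 and that the step size (see STEP in code example #3) divides 100 evenly:

def no_2_interval(bar_no, step=10):
    """Map a bar number to its (lower, upper) percentage bounds.

    Hypothetical reimplementation: with step=10, bar 0 -> (0, 10)
    and bar 9 -> (90, 100).
    """
    lower = bar_no * step
    return (lower, lower + step)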
Code example #3
def inner_links_data(crawl_id):
    """Compute the distribution of incoming internal links for a crawl.

    crawl_id - crawl id
    """
    delegate = Delegate()
    # Build the percentage intervals [0, STEP), [STEP, 2*STEP), ..., ending at 100
    # (STEP is a module-level constant, e.g. 10)
    intervals = []
    for i in range(0, 100, STEP):
        intervals.append([i, i + STEP])
    print("Intervals %r " % intervals)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # For every page count the number of internal full URLs pointing to it (Li)
    d = dict()
    check = 0
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        d[page.id] = no
        check = check + no

    for k, v in d.items():
        print("\n%d -> %d" % (k, v))

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)
    print("Total full internal links: %d " % no_total)

    assert check == no_total, "The total number of internal links does not match"

    # For every page compute the percentage of internal full URLs pointing to it: Pi = Li * 100 / T
    percents = dict()
    for page in pages:
        percents[page.id] = d[page.id] * 100 / no_total

    print("\nPercentages")
    for k, v in percents.items():
        print("\n%d -> %.2f%%" % (k, v))

    # For every interval I1[0-10), I2[10-20), ..., I10[90-100], count the links
    # of the pages that fall into that interval:
    #    I1....Ti1...Pi1 = Ti1 * 100 / T
    #    I2....Ti2...Pi2 = Ti2 * 100 / T

    # Compute percentage of every interval

    partials = dict()
    labels = []
    for (lo, hi) in intervals:
        key = "{}-{}%".format(lo, hi)
        labels.append(key)
        partials[key] = 0
        for page in pages:
            # Half-open intervals [lo, hi), except the last one, which is
            # closed so that pages at exactly 100% are counted
            if (lo <= percents[page.id] < hi) or (hi == 100 and percents[page.id] == hi):
                partials[key] += percents[page.id]

    print("\nPartials")
    for k, v in partials.items():
        print("\n{} {} ".format(k, v))

    # Prepare the chart data; a Chart.js-style sample is shown below
    '''
    {
                labels: ['Red', 'Blue', 'Yellow', 'Green', 'Purple', 'Orange'],
                datasets: [{
                    label: '# of Votes',
                    data: [12, 19, 3, 5, 2, 3],
                    backgroundColor: [
                        'rgba(255, 99, 132, 0.2)',
                        'rgba(54, 162, 235, 0.2)',
                        'rgba(255, 206, 86, 0.2)',
                        'rgba(75, 192, 192, 0.2)',
                        'rgba(153, 102, 255, 0.2)',
                        'rgba(255, 159, 64, 0.2)'
                    ],
                    borderColor: [
                        'rgba(255, 99, 132, 1)',
                        'rgba(54, 162, 235, 1)',
                        'rgba(255, 206, 86, 1)',
                        'rgba(75, 192, 192, 1)',
                        'rgba(153, 102, 255, 1)',
                        'rgba(255, 159, 64, 1)'
                    ],
                    borderWidth: 1
                }]
            }
    '''

    new_data = {
        'labels': list(partials.keys()),
        'datasets': [{
            'label': 'Inner links',
            'data': list(partials.values())
        }]
    }

    return new_data
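
The returned dictionary mirrors the Chart.js data shape from the commented sample, so it can be served to the front end as JSON. A minimal sketch of such an endpoint, assuming Flask as in code example #1 (the route path is hypothetical and app stands in for the project's existing application object):

from flask import Flask, jsonify

app = Flask(__name__)  # stands in for the project's existing app object

@app.route('/crawl/<int:crawl_id>/inner-links')  # hypothetical path
def inner_links_json(crawl_id):
    return jsonify(inner_links_data(crawl_id))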
Code example #4
    def test_page(self):
        delegate = XDelegate()

        print("test_page started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        no_pages = delegate.resource_count_visited(crawl.id)
        assert no_pages == 0, "No of pages is {}".format(no_pages)

        # Page
        crawl_resources = delegate.resource_get_all_by_crawl(crawl.id)
        assert len(crawl_resources) == 0

        # test resource_get_by_absolute_url_and_crawl_id()
        r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
            "no such url :p", crawl.id)
        assert r1 is None

        # test resource_is_present()
        present = delegate.resource_is_present('no such url :p', crawl.id)
        assert not present

        page = Resource()
        page.crawl_id = crawl.id
        page.content = "A long content " + "a" * 1024 * 1024
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)
        assert page.id > 0

        # test resource_get_by_id()
        r2 = delegate.resource_get_by_id(page.id)
        assert r2.id == page.id

        # test resource_is_present()
        present = delegate.resource_is_present(page.absolute_url, crawl.id)
        assert present

        pages = delegate.resource_get_all()
        assert len(pages) > 0

        no_pages = delegate.resource_count_visited(crawl.id)
        assert no_pages == 1, "No of pages is {}".format(no_pages)

        crawl_resources = delegate.resource_get_all_by_crawl(crawl.id)
        assert len(crawl_resources) > 0

        r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
            page.absolute_url, crawl.id)
        assert r1.id == page.id

        # Test cascade delete
        delegate.crawl_delete_all()
        pages = delegate.resource_get_all()
        assert len(pages) == 0, "Expected 0 pages after cascade delete, found {}".format(len(pages))

        # Clean up
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        print("test_page done")