Example #1
def bar_data(crawl_id, bar_no):
    """Returns the pages for a bar (interval) inside a crawl"""
    delegate = Delegate()

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    (lower, upper) = no_2_interval(bar_no)
    selected_pages = []
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        percent = no * 100 / no_total if no_total else 0  # guard against an empty crawl
        # The last bar includes its upper bound; the other bars are half-open intervals
        in_bar = (lower <= percent <= upper) if upper == 100 else (lower <= percent < upper)
        if in_bar:
            selected_pages.append({'page': page, 'no': no})

    return selected_pages
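
The helper no_2_interval is not part of this listing; a minimal sketch of what it might look like, assuming the bars are fixed-width percentage buckets like the STEP-sized intervals built in Example #7 (the implementation is an assumption):

STEP = 10  # assumed bar width in percent, mirroring the intervals of Example #7


def no_2_interval(bar_no):
    # Map a 0-based bar index to its (lower, upper) percentage bounds (hypothetical helper)
    lower = bar_no * STEP
    return lower, lower + STEP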
Example #2
def site_add():
    delegate = Delegate()
    user = delegate.user_get_by_id(session['user_id'])
    page = request.form['page']
    site_name = request.form['site']
    site_url = request.form['url']
    site = Site(name=site_name, url=site_url)
    delegate.site_create(site)

    user.current_site_id = site.id
    delegate.user_update(user)  # persist the user's new current site

    if page == 'home':
        return redirect(url_for('home'))
    else:
        return redirect(url_for('sites'))
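
The route decorator is not included in the snippet above. A usage sketch with Flask's test client, assuming the application object is named app, the view is registered at '/site/add' (the real URL rule is not shown), and a user with id 1 already exists:

with app.test_client() as client:
    with client.session_transaction() as s:
        s['user_id'] = 1  # assumed: a valid user id
    client.post('/site/add', data={'page': 'home',
                                   'site': 'Example site',
                                   'url': 'http://example.com'})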
Example #3
    def __init__(self,
                 delegate,
                 initialLink=None,
                 max_links=0,
                 no_workers=10,
                 id=None):
        Thread.__init__(self)
        self.noOfWorkers = no_workers
        self.workers = []
        self.running = True
        self.paused = False
        self.condition = RLock()
        # Use the injected delegate; fall back to a fresh one if none was given
        self.delegate = delegate if delegate is not None else Delegate()
        # A list of listeners that want to receive messages (e.g. progress) from the Crawler
        self.listeners = []
        # Note: a default of str(uuid.uuid4()) in the signature would be evaluated only
        # once, so every instance would share the same id; generate it per instance instead
        self.id = id if id is not None else str(uuid.uuid4())
        self.initialLink = initialLink
        if initialLink is not None:
            self.add_initial_url(initialLink)
        self.max_links = max_links
        try:
            self.domain_regex = re.compile(get_domain(initialLink))
        except Exception as ex:
            logging.error("Exception {}".format(ex))
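
The get_domain and is_internal helpers used by the crawler are not part of this listing; a minimal sketch of the behaviour the calls above seem to rely on (the implementations are assumptions):

from urllib.parse import urlparse


def get_domain(url):
    # Assumed behaviour: return the host part, e.g. 'http://foo.com/x' -> 'foo.com'
    return urlparse(url).netloc


def is_internal(domain, absolute_url):
    # Assumed behaviour: a link is internal when it points at the crawled domain
    return urlparse(absolute_url).netloc == domain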
Example #4
import os
import argparse
import sys

from amalgam.delegatex import XDelegate
from amalgam import database
from amalgam.models.modelsx import User, Site, metadata

delegate = XDelegate()


def drop_tables():
    """Drop all tables"""
    metadata.drop_all(database.engine)

    # if database.SQLALCHEMY_DATABASE == 'sqlite':
    #     db_full_path = os.path.abspath('./amalgam.db')
    #     if os.path.isfile(db_full_path):
    #         print("Removing old database: {}".format(db_full_path))
    #         os.remove(db_full_path)
    #     else:
    #         print("No database present at : {}".format(db_full_path))
    # elif database.SQLALCHEMY_DATABASE == 'mysql':
    #     # TODO: Add a drop all table
    #     #  Maybe: https://stackoverflow.com/questions/11233128/how-to-clean-the-database-dropping-all-records-using-sqlalchemy
    #     pass


def create_tables():
    """Create all tables if needed"""
    metadata.create_all(database.engine)
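
A minimal sketch of how these helpers might be wired together to reset the schema; the __main__ guard is an assumption and not part of the original script:

if __name__ == '__main__':
    # Recreate the schema from scratch: drop everything, then create all tables
    drop_tables()
    create_tables()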
Example #5
def main():
    # domain = 'localhost:7000'
    domain = 'http://abctimetracking.com'
    max_links = 0

    # Empty DB
    from manage_db import empty, mock
    mock()

    # Parse arguments
    parser = argparse.ArgumentParser(description="A simple website crawler.")
    parser.add_argument('-d',
                        '--domain',
                        type=str,
                        default=domain,
                        help='Domain to crawl')
    parser.add_argument('-w',
                        '--workers',
                        type=int,
                        default=10,
                        help='Number of workers')
    parser.add_argument('-m',
                        '--max-links',
                        type=int,
                        default=0,
                        help='Maximum no. of links to index')
    parser.add_argument('--delay',
                        type=int,
                        default=0,
                        help='Delay between requests')
    args = parser.parse_args()

    if args.domain:
        domain = args.domain
    else:
        print('No domain passed, using %s.' % domain)
        print(
            'Read usage details in file header for more information on passing arguments.'
        )

    if args.max_links:
        max_links = args.max_links

    # The default domain already carries a scheme; only prepend one when it is missing
    theURL = domain if domain.startswith('http') else 'http://' + domain
    noOfWorkers = args.workers

    delegate = Delegate()
    site = Site(name=domain, url=theURL)
    delegate.site_create(site)
    crawl = Crawl(site_id=site.id)
    delegate.crawl_create(crawl)

    crawler = CrawlerDB(initialLink=theURL,
                        max_links=max_links,
                        no_workers=noOfWorkers,
                        delegate=delegate,
                        id=crawl.id)
    # crawler = CrawlerDB(max_links=max_links, no_workers=noOfWorkers, delegate=delegate, id=1)

    t1 = time.time()
    crawler.start()
    crawler.join()
    t2 = time.time()
    total_time = t2 - t1

    logger.info("Total internal links visited: %d in: %ds" %
                (crawler.no_visited_urls(), total_time))
    # for url in [link.absolute_url for link in crawler.visited]:
    # 	logger.info("\t" + url)

    logger.info("Total external links: %d" % crawler.no_external_urls())
    # for url in [link.absolute_url for link in crawler.external_links]:
    # 	logger.info("\t" + url)

    # report('./crawl-requests-report.log', crawler.visited)

    # crawler.export()
    print("All done. In limbo")

    crawler = None

    time.sleep(10)
Example #6
class CrawlerDB(Thread):
    def __init__(self,
                 delegate,
                 initialLink=None,
                 max_links=0,
                 no_workers=10,
                 id=None):
        Thread.__init__(self)
        self.noOfWorkers = no_workers
        self.workers = []
        self.running = True
        self.paused = False
        self.condition = RLock()
        # Use the injected delegate; fall back to a fresh one if none was given
        self.delegate = delegate if delegate is not None else Delegate()
        # A list of listeners that want to receive messages (e.g. progress) from the Crawler
        self.listeners = []
        # Note: a default of str(uuid.uuid4()) in the signature would be evaluated only
        # once, so every instance would share the same id; generate it per instance instead
        self.id = id if id is not None else str(uuid.uuid4())
        self.initialLink = initialLink
        if initialLink is not None:
            self.add_initial_url(initialLink)
        self.max_links = max_links
        try:
            self.domain_regex = re.compile(get_domain(initialLink))
        except Exception as ex:
            logging.error("Exception {}".format(ex))

    # def __del__(self):
    # 	self.delegate.get_session().close()

    def add_initial_url(self, address):
        logger.info("Add initial URL")
        with self.condition:
            url = Url(url=address,
                      absolute_url=address,
                      type=Url.TYPE_INTERNAL,
                      crawl_id=self.id,
                      job_status=Url.JOB_STATUS_NOT_VISITED)
            self.delegate.url_create(url)

    def no_unvisited_urls(self):
        with self.condition:
            return self.delegate.url_count_unvisited(self.id)

    def no_pending_urls(self):
        with self.condition:
            return self.delegate.url_count_pending(self.id)

    def all_unvisited_urls(self):
        with self.condition:
            return self.delegate.url_get_all_unvisited(self.id)

    def no_visited_urls(self):
        with self.condition:
            return self.delegate.url_count_visited(self.id)

    def no_visited_resources(self):
        with self.condition:
            return self.delegate.resource_count_visited(self.id)

    def no_external_urls(self):
        with self.condition:
            return self.delegate.url_count_external(self.id)

    def next_unvisited_link_id(self):
        link_id = -1
        with self.condition:
            url = self.delegate.url_get_first_unvisited(self.id)
            if url is not None:
                url.job_status = Url.JOB_STATUS_IN_PROGRESS  # Set Url as in progress
                self.delegate.url_update(url)
                # self.increaseNoOfJobs()
                link_id = url.id
        return link_id

    def mark_url_as_visited(self, url_id):
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.job_status = Url.JOB_STATUS_VISITED
            self.delegate.url_update(url)

    def _type_links(self, links):
        for link in links:
            if is_internal(get_domain(self.initialLink),
                           link['absolute']):  # internal link
                link['type'] = 'internal'
            else:  # external link
                link['type'] = 'external'

    def _get_links(self, link_id):
        with self.condition:
            link = self.delegate.url_get_by_id(link_id)
            (page, links) = get_links(link.absolute_url)
            self._type_links(links)
            return page, links

    def link2url(self, link):
        url = Url(crawl_id=self.id)
        # url=link['href'], absolute_url=link['absolute'], type=link['type'],
        if 'href' in link:
            url.url = link['href']
        if 'absolute' in link:
            url.absolute_url = link['absolute']
        if 'type' in link:
            url.type = link['type']
        if 'content' in link:
            url.raw_content = str(link['content'])
            # TODO: Parse raw_content and keep only the text, without HTML tags or other markup
            url.text = str(link['content'])
        return url

    def page2resource(self, page):
        resource = Resource(crawl_id=self.id)
        if 'url' in page:
            resource.absolute_url = page['url']
        if 'content' in page:
            resource.content = page['content']
        if 'elapsed' in page:
            resource.elapsed = page['elapsed']
        return resource

    def add_links(self, links, src_resource_id=None, status_code=200):
        """Add a bunch of URLs using the resource id as source (page where found it)"""
        with self.condition:
            for link in links:
                url = self.link2url(link)
                if src_resource_id is not None:
                    url.src_resource_id = src_resource_id

                    # Check if the destination resource exists and, if it does, mark the URL as visited
                    try:
                        src_resource = self.delegate.resource_get_by_id(
                            src_resource_id)
                        dest_resource = self.delegate.resource_get_by_absolute_url_and_crawl_id(
                            url.absolute_url, src_resource.crawl_id)
                        if dest_resource is not None:
                            url.job_status = Url.JOB_STATUS_VISITED
                            url.dst_resource_id = dest_resource.id
                            url.status_code = status_code
                    except Exception as e:
                        logger.warning("Exception {}".format(e))

                self.delegate.url_create(url)

    def add_resource(self, page):
        with self.condition:
            if not self.delegate.resource_is_present(page['url'], self.id):
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)

    def connect_url_to_destination(self, url_id, resource_id):
        with self.condition:
            url = self.delegate.url_get_by_id(url_id)
            url.dst_resource_id = resource_id
            self.delegate.url_update(url)

    def resource_get_by_absolute_url_and_crawl_id(self, address, crawler_id):
        with self.condition:
            resource = self.delegate.resource_get_by_absolute_url_and_crawl_id(
                address, crawler_id)
            return resource

    def resource_create(self, page):
        with self.condition:
            resource = None
            try:
                resource = self.page2resource(page)
                self.delegate.resource_create(resource)
            except Exception as e:
                logger.warning("{} Exception {}.".format(
                    currentThread().getName(), e))
            return resource

    def run(self):

        # Initialize workers
        for i in range(self.noOfWorkers):
            self.workers.append(
                Thread(target=self.workerJob,
                       kwargs={"crawlId": self.id},
                       name="Thread-{}".format(i)))

        # Start workers
        self._start_all_workers()

        while self.running:
            logger.debug("[%s] Crawler thread cycle started." %
                         (currentThread().getName()))
            if self.paused:
                logger.debug("[%s] Crawler paused." %
                             (currentThread().getName()))
                time.sleep(1)  # avoid busy-waiting while paused
                continue

            logger.debug("[%s] Crawler check if jobs are done." %
                         (currentThread().getName()))
            if self._are_jobs_done():
                logger.debug("Crawler is shutting down")
                self.setRunning(False)
                break
            else:
                logger.debug("[%s] Crawler's jos are NOT done." %
                             (currentThread().getName()))

            logger.debug("[%s] Crawler sleep." % (currentThread().getName()))
            time.sleep(1)

        # Join them
        self._join_all_workers()

        # self.delegate.get_session().close()

        msg = {
            "status": "done",
            "visited": self.no_visited_urls(),
            "to_visit": self.no_unvisited_urls(),
            "max_links": self.max_links,
            "crawlId": self.id
        }

        self.notify(msg)

    def workerJob(self, crawlId):
        while self.running:
            logger.debug("[%s] Worker thread cycle started." %
                         (currentThread().getName()))

            if self.paused:
                time.sleep(1)  # avoid busy-waiting while paused
                continue

            # If a maximum number of pages was specified, check whether it was already reached
            if self.max_links > 0:
                no_pages_visited = self.no_visited_resources()
                if no_pages_visited >= self.max_links:
                    time.sleep(1)  # wait for the crawler thread to shut the workers down
                    continue

            # Grab next job
            link_id = self.next_unvisited_link_id()
            logger.debug("[%s] Next link [%d]." %
                         (currentThread().getName(), link_id))

            if link_id != -1:
                logger.debug("[%s] Current link : %d" %
                             (currentThread().getName(), link_id))
                page, links = self._get_links(link_id)
                logger.debug("[%s] Discovered [%d] links." %
                             (currentThread().getName(), len(links)))

                try:
                    with self.condition:
                        # Update links status code
                        url = self.delegate.url_get_by_id(link_id)
                        url.status_code = page['status-code']
                        self.delegate.url_update(url)

                        if page['status-code'] == 200:
                            # 1.Add Resource 2.Link URLs to (new | existing) Resources
                            resource = self.resource_get_by_absolute_url_and_crawl_id(
                                page['url'], self.id)
                            if resource is None:
                                #Add it only if max links not reached
                                maximum_reached = False
                                if self.max_links > 0:  # We have a max_link specified
                                    no_pages_visited = self.no_visited_resources(
                                    )
                                    if no_pages_visited >= self.max_links:
                                        maximum_reached = True

                                if not maximum_reached:
                                    resource = self.resource_create(page)
                                    self.connect_url_to_destination(
                                        link_id, resource.id)
                                    logger.debug(
                                        "[%s] Adding links to DB linked to resource [%d]"
                                        % (currentThread().getName(),
                                           resource.id))
                                    self.add_links(links, resource.id,
                                                   page['status-code'])
                            else:
                                # Resource already added only make the end connection
                                self.connect_url_to_destination(
                                    link_id, resource.id)
                        else:
                            pass

                        self.mark_url_as_visited(link_id)

                        msg = {
                            "status": "in_progress",
                            "visited": self.no_visited_urls(),
                            "to_visit": self.no_unvisited_urls(),
                            "max_links": self.max_links,
                            "crawlId": crawlId,
                            "currentWorker": currentThread().getName()
                        }

                        self.notify(msg)
                except Exception as e:
                    logger.error("Error {}".format(e))

            logger.debug("[%s] cycle ended." % (currentThread().getName()))
        else:
            logger.debug("[%s] is shutting down." %
                         (currentThread().getName()))
            # self.delegate.get_session().close()

    def stop(self):
        self.setRunning(False)

    def pause(self):
        self.paused = True

    def resume(self):
        if self.paused:
            self.paused = False

    def _start_all_workers(self):
        for w in self.workers:
            w.start()

    def _are_jobs_done(self):
        # Test if noOfJobs == 0 and to_visit == 0
        # no_of_jobs = self.getNoOfJobs()

        # FIXME: If a thread grabs the initial link, while here, no_unvisited_urls() will
        # return zero (on next line) , also the no_of_jobs are zero so the Crawler
        # will initiate shutdown

        no_pending_urls = self.no_pending_urls()
        logger.debug("Crawler: _are_jobs_done(...) : no_pendind_urls = %d " %
                     (no_pending_urls, ))

        if no_pending_urls == 0:
            return True

        # Test if we have reached the max no of pages
        if self.max_links > 0:
            no_pages_visited = self.no_visited_resources()
            if no_pages_visited >= self.max_links:
                return True

        return False

    def _join_all_workers(self):
        for w in self.workers:
            w.join()

    def setRunning(self, status):
        self.running = status

    def addListener(self, callback):
        self.listeners.append(callback)

    def removeListener(self, callback):
        self.listeners.remove(callback)

    def notify(self, msg):
        for callback in self.listeners:
            callback(msg)
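
A minimal usage sketch for CrawlerDB, assuming the same Delegate, Site and Crawl objects used in Example #5; the callback simply prints the progress messages emitted through notify():

def print_progress(msg):
    # msg is one of the dicts built in run()/workerJob(), e.g. {'status': ..., 'visited': ...}
    print("{status}: {visited} visited, {to_visit} still to visit".format(**msg))


delegate = Delegate()
site = Site(name='example.com', url='http://example.com')
delegate.site_create(site)
crawl = Crawl(site_id=site.id)
delegate.crawl_create(crawl)

crawler = CrawlerDB(delegate,
                    initialLink='http://example.com',
                    max_links=50,
                    no_workers=4,
                    id=crawl.id)
crawler.addListener(print_progress)
crawler.start()
crawler.join()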
Example #7
def inner_links_data(crawl_id):
    """
    crawl_id - crawl id
    """
    delegate = Delegate()
    intervals = []
    for i in range(0, 100, STEP):
        intervals.append([i, i + STEP])
    print("Intervals %r " % intervals)

    # Select all pages
    pages = delegate.resource_get_all_by_crawl(crawl_id)

    # For every page select the no of internal full urls pointing to it. = Li
    d = dict()
    check = 0
    for page in pages:
        no = delegate.url_count_incoming_for_resource(page.id)
        d[page.id] = no
        check = check + no

    for k, v in d.items():
        print("\n%d -> %d" % (k, v))

    # Find the total number of internal full links = T
    no_total = delegate.url_count_internal_full(crawl_id)
    print("Total full internal links: %d " % no_total)

    assert check == no_total, "The total number of internal links does not match"

    # For every page compute the percentage of internal full urls pointing to it: Pi = Li * 100 / T
    percents = dict()
    for page in pages:
        percents[page.id] = d[page.id] * 100 / no_total if no_total else 0

    print("\nPercentages")
    for k, v in percents.items():
        print("\n%d -> %.2f%%" % (k, v))

    # For every interval I1[0-10), I2[10-20), ..., I10[90-100] sum the percentages of the
    # pages whose incoming-link share falls into that interval:
    #    I1 ... Ti1 ... Pi1 = Ti1 * 100 / T
    #    I2 ... Ti2 ... Pi2 = Ti2 * 100 / T

    # Compute the percentage of every interval

    partials = dict()
    labels = []
    for interval in intervals:
        key = "{}-{}%".format(interval[0], interval[1])
        labels.append(key)
        partials[key] = 0
        for page in pages:
            if interval[1] == 100:
                if interval[0] <= percents[page.id] <= interval[1]:
                    partials[key] = partials[key] + percents[page.id]
            else:
                if interval[0] <= percents[page.id] < interval[1]:
                    partials[key] = partials[key] + percents[page.id]

    print("\nPartials")
    for k, v in partials.items():
        print("\n{} {} ".format(k, v))

    # Prepare the chart data; a Chart.js sample is shown below
    '''
    {
                labels: ['Red', 'Blue', 'Yellow', 'Green', 'Purple', 'Orange'],
                datasets: [{
                    label: '# of Votes',
                    data: [12, 19, 3, 5, 2, 3],
                    backgroundColor: [
                        'rgba(255, 99, 132, 0.2)',
                        'rgba(54, 162, 235, 0.2)',
                        'rgba(255, 206, 86, 0.2)',
                        'rgba(75, 192, 192, 0.2)',
                        'rgba(153, 102, 255, 0.2)',
                        'rgba(255, 159, 64, 0.2)'
                    ],
                    borderColor: [
                        'rgba(255, 99, 132, 1)',
                        'rgba(54, 162, 235, 1)',
                        'rgba(255, 206, 86, 1)',
                        'rgba(75, 192, 192, 1)',
                        'rgba(153, 102, 255, 1)',
                        'rgba(255, 159, 64, 1)'
                    ],
                    borderWidth: 1
                }]
            }
    '''

    new_data = {
        'labels': list(partials.keys()),
        'datasets': [{
            'label': 'Inner links',
            'data': list(partials.values())
        }]
    }

    return new_data
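
A sketch of how the returned structure might be served to a Chart.js frontend, assuming a Flask application object named app (the URL rule below is an assumption):

from flask import jsonify, request


@app.route('/report/inner-links')
def inner_links_report():
    crawl_id = request.args.get('crawl_id', type=int)
    # new_data already matches the Chart.js format sketched in the comment above
    return jsonify(inner_links_data(crawl_id))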
Example #8
def sites():
    delegate = Delegate()
    user = delegate.user_get_by_id(session['user_id'])
    # TODO: In the future show only sites for current user
    sites = delegate.site_get_all()
    return render_template('sites.html', user=user, sites=sites)
Example #9
def report_inner_incomming_urls():
    resource_id = request.args.get('resource_id', type=int)
    delegate = Delegate()
    pages = delegate.resource_get_all_incoming_for_resource(resource_id)
    jdata = jsonpickle.encode(pages)
    return jdata
Example #10
    def test_crawl(self):
        delegate = XDelegate()

        print("test_crawl started")
        # session = delegate.get_session()

        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        # Create a datetime 2 minutes after the first crawl, so crawl2 becomes the latest one
        delta = datetime.timedelta(minutes=2)
        t2 = crawl.date + delta

        crawl2 = Crawl(site_id=site1.id, date=t2)
        delegate.crawl_create(crawl2)
        assert crawl2.id > 0

        sites = delegate.site_get_all()
        print("No of site: {}".format(len(sites)))
        assert len(sites) == 1

        crawls = delegate.crawl_get_all()
        assert len(crawls) == 2

        crawls2 = delegate.crawl_get_all_for_site(site1.id)
        assert len(crawls2) == 2

        last_crawl = delegate.crawl_get_last_for_site(site1.id)
        assert last_crawl.id == crawl2.id, "Last crawl id was {} when it should be {}".format(
            last_crawl.id, crawl2.id)

        # delegate.crawl_delete_all()
        delegate.site_delete_all()
        print("test_crawl done")
Example #11
    def test_user(self):
        delegate = XDelegate()

        u1 = User()
        u1.email = "*****@*****.**"
        u1.password = "******"
        u1.name = "One"
        delegate.user_create(u1)
        assert u1.id > 0

        u2 = delegate.user_get_by_email_and_password(u1.email, u1.password)
        assert u1.email == u2.email
        assert u1.password == u2.password
        assert u1.id == u2.id, "U1's id:{}  U2's id:{} ".format(u1.id, u2.id)

        u2.name = u2.name + 'x'
        r = delegate.user_update(u2)
        # assert r

        u3 = delegate.user_get_by_id(u2.id)
        assert u2.id == u3.id

        assert u2.name == u3.name

        users1 = delegate.user_get_all()
        assert len(users1) == 1

        r = delegate.user_delete_by_id(u1.id)
        assert r

        users1 = delegate.user_get_all()
        assert len(users1) == 0

        delegate.user_delete_all()

        users2 = delegate.user_get_all()
        assert len(users2) == 0
Example #12
    def test_link(self):
        delegate = XDelegate()

        print("test_page started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        # Page
        page = Resource()
        page.crawl_id = crawl.id
        page.content = "Ala bala portocala"
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)

        # Link

        # Test url_is_present()
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert not p1

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 0, 'n1 is {}'.format(n1)

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 0

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 0

        url1 = Url()
        url1.src_resource_id = page.id
        url1.url = '/team'
        url1.absolute_url = 'https://scriptoid.com/team'
        url1.type = Url.TYPE_INTERNAL
        url1.crawl_id = crawl.id
        url1.job_status = Url.JOB_STATUS_IN_PROGRESS
        lid1 = delegate.url_create(url1)
        assert url1.id > 0
        assert lid1 == url1.id

        url2 = Url()
        url2.src_resource_id = page.id
        url2.dst_resource_id = page.id
        url2.url = '/contact'
        url2.absolute_url = 'https://scriptoid.com/index.php'
        url2.type = Url.TYPE_INTERNAL
        url2.crawl_id = crawl.id
        delegate.url_create(url2)
        assert url2.id > 0

        url3 = Url()
        url3.dst_resource_id = page.id
        url3.url = '/jobs'
        url3.absolute_url = 'https://scriptoid.com/jobs.php'
        url3.type = Url.TYPE_INTERNAL
        url3.crawl_id = crawl.id
        delegate.url_create(url3)
        assert url3.id > 0

        # Test url_count_incoming_for_resource()
        uc1 = delegate.url_count_incoming_for_resource(page.id)
        assert uc1 == 1

        # Test url_get_by_id()
        u1 = delegate.url_get_by_id(url1.id)
        assert u1.id == url1.id

        # Test url_is_present()
        p1 = delegate.url_is_present('https://scriptoid.com/index.php',
                                     crawl.id)
        assert p1

        # Test url_get_all_by_crawl_id()
        crawl_urls = delegate.url_get_all_by_crawl_id(crawl.id)
        assert len(crawl_urls) == 3

        # Test first unvisited link
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id, 'l1.id = {} and url2.id = {}'.format(
            l1.id, url2.id)

        # Test url_get_all_unvisited()
        unvisited1 = delegate.url_get_all_unvisited(crawl.id)
        assert len(unvisited1) == 2

        # Test url_count_unvisited()
        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 0, 'Actually n2 is {}'.format(n2)

        url1.job_status = Url.JOB_STATUS_VISITED
        delegate.url_update(url1)
        l1 = delegate.url_get_first_unvisited(crawl_id=crawl.id)
        assert l1.id == url2.id

        n1 = delegate.url_count_unvisited(crawl_id=crawl.id)
        assert n1 == 2, 'n1 is {}'.format(n1)

        n2 = delegate.url_count_visited(crawl_id=crawl.id)
        assert n2 == 1, 'n2 is {}'.format(n2)

        # Test url_count_internal_full()
        cif = delegate.url_count_internal_full(crawl.id)
        assert cif == 1

        # Test url_count_pending()
        ucp = delegate.url_count_pending(crawl.id)
        assert ucp == 2

        # Test url_delete_all()
        delegate.url_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "When actually there are {}".format(len(links))

        # Test url_count_external()
        uce = delegate.url_count_external(crawl.id)
        assert uce == 0

        url4 = Url()
        url4.dst_resource_id = page.id
        url4.url = '/jobs'
        url4.absolute_url = 'https://scriptoid.com/jobs.php'
        url4.type = Url.TYPE_EXTERNAL
        url4.crawl_id = crawl.id
        delegate.url_create(url4)
        assert url4.id > 0

        uce = delegate.url_count_external(crawl.id)
        assert uce == 1

        assert delegate.url_delete_by_id(url4.id)

        # Test a cascade delete from parent Page resource_delete_all() to Link
        url = Url()
        url.src_resource_id = page.id
        url.url = '/contact'
        url.absolute_url = 'https://scriptoid.com/index.php'
        url.type = Url.TYPE_INTERNAL
        url.crawl_id = crawl.id
        delegate.url_create(url)
        assert url.id > 0

        delegate.resource_delete_all()
        links = delegate.url_get_all()
        assert len(links) == 0, "When actually there are {}".format(len(links))

        # Clean up
        # delegate.link_delete_all()
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        print("test_page done")
Example #13
    def test_page(self):
        delegate = XDelegate()

        print("test_page started")
        # Site 1
        site1 = Site()
        site1.name = "Site1"
        site1.url = 'http://foo.com'
        delegate.site_create(site1)

        # Crawl
        crawl = Crawl(site_id=site1.id)
        delegate.crawl_create(crawl)
        assert crawl.id > 0

        no_pages = delegate.resource_count_visited(crawl.id)
        assert no_pages == 0, "No of pages is {}".format(no_pages)

        # Page
        craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
        assert len(craw_resources) == 0

        # test resource_get_by_absolute_url_and_crawl_id()
        r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
            "no such url :p", crawl.id)
        assert r1 is None

        # test resource_is_present()
        present = delegate.resource_is_present('no such url :p', crawl.id)
        assert not present

        page = Resource()
        page.crawl_id = crawl.id
        page.content = "A long content " + "a" * 1024 * 1024
        page.absolute_url = "https://scriptoid.com/index.php"
        delegate.resource_create(page)
        assert page.id > 0

        # test resource_get_by_id()
        r2 = delegate.resource_get_by_id(page.id)
        assert r2.id == page.id

        # test resource_is_present()
        present = delegate.resource_is_present(page.absolute_url, crawl.id)
        assert present

        pages = delegate.resource_get_all()
        assert len(pages) > 0

        no_pages = delegate.resource_count_visited(crawl.id)
        assert no_pages == 1, "No of pages is {}".format(no_pages)

        craw_resources = delegate.resource_get_all_by_crawl(crawl.id)
        assert len(craw_resources) > 0

        r1 = delegate.resource_get_by_absolute_url_and_crawl_id(
            page.absolute_url, crawl.id)
        assert r1.id == page.id

        # Test cascade delete
        delegate.crawl_delete_all()
        pages = delegate.resource_get_all()
        assert len(pages) == 0, "It should be {} but we found {}".format(
            0, len(pages))

        # Clean up
        delegate.resource_delete_all()
        delegate.crawl_delete_all()
        delegate.site_delete_all()

        print("test_page done")