Example #1
    def links_in_spider_base(pid, host):
        """ Put found links in MySQL """
        links_per_time_limit = 50
        c = WSCounter(1, 60, int(Registry().get('mongo').spider_urls.count() / links_per_time_limit))
        Urls = UrlsModel()
        host_id = HostsModel().get_id_by_name(pid, host)
        urls_add = []

        skip = 0
        while True:
            links = mongo_result_to_list(
                Registry().get('mongo').spider_urls.find().skip(skip).limit(links_per_time_limit)
            )

            for link in links:
                url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
                urls_add.append({
                    'url': url,
                    'referer': link['referer'],
                    'response_code': link['code'],
                    'response_time': link['time'],
                    'size': link['size'],
                    'who_add': 'spider',
                    'spidered': link['checked']
                })
            Urls.add_mass(pid, host_id, urls_add)

            urls_add = []

            to_update = {'spidered': [], 'code': [], 'time': [], 'size': []}

            for link in links:
                url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
                if link['checked']:
                    to_update['spidered'].append({'url': url, 'value': 1})
                to_update['code'].append({'url': url, 'value': link['code']})
                to_update['time'].append({'url': url, 'value': link['time']})
                to_update['size'].append({'url': url, 'value': link['size']})

            Urls.update_url_field_mass(pid, host, 'spidered', to_update['spidered'])
            Urls.update_url_field_mass(pid, host, 'response_code', to_update['code'])
            Urls.update_url_field_mass(pid, host, 'response_time', to_update['time'])
            Urls.update_url_field_mass(pid, host, 'size', to_update['size'])

            skip += len(links)

            c.up()

            if len(links) < links_per_time_limit:
                break
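Both variants of this function lean on a `mongo_result_to_list()` helper from the surrounding project. Its body is not shown on this page; a plausible minimal sketch, assuming it simply materializes a PyMongo cursor into a list of plain dicts, is:

    # Hypothetical sketch only -- the project's real helper may differ.
    def mongo_result_to_list(cursor):
        """Materialize a PyMongo cursor into a plain list of documents."""
        return [doc for doc in cursor]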
Example #2
    def links_in_spider_base(pid, host):
        """ Put found links in MySQL """
        links_per_time_limit = 50
        c = WSCounter(1, 60, int(Registry().get('mongo').spider_urls.count()/links_per_time_limit))
        Urls = UrlsModel()
        host_id = HostsModel().get_id_by_name(pid, host)
        urls_add = []

        skip = 0
        while True:
            links = mongo_result_to_list(
                Registry().get('mongo').spider_urls.find().skip(skip).limit(links_per_time_limit)
            )

            for link in links:
                url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
                urls_add.append({
                    'url': url,
                    'referer': link['referer'],
                    'response_code': link['code'],
                    'response_time': link['time'],
                    'size': link['size'],
                    'who_add': 'spider',
                    'spidered': link['checked']
                })
            Urls.add_mass(pid, host_id, urls_add)

            urls_add = []

            to_update = {
                'spidered': [],
                'code': [],
                'time': [],
                'size': []
            }

            for link in links:
                url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
                if link['checked']:
                    to_update['spidered'].append({'url': url, 'value': 1})
                to_update['code'].append({'url': url, 'value': link['code']})
                to_update['time'].append({'url': url, 'value': link['time']})
                to_update['size'].append({'url': url, 'value': link['size']})

            Urls.update_url_field_mass(pid, host, 'spidered', to_update['spidered'])
            Urls.update_url_field_mass(pid, host, 'response_code', to_update['code'])
            Urls.update_url_field_mass(pid, host, 'response_time', to_update['time'])
            Urls.update_url_field_mass(pid, host, 'size', to_update['size'])

            skip += len(links)

            c.up()

            if len(links) < links_per_time_limit:
                break
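The examples also build a progress counter with `WSCounter(1, 60, total)` and advance it with `c.up()` once per batch. The project's actual class is not reproduced here; judging only from this usage, it appears to be a simple batch-progress tracker, roughly like the hypothetical stand-in below (an assumption, not the real implementation).

    # Hypothetical stand-in for WSCounter, inferred only from the calls above.
    # The real class in the project may report progress differently.
    class WSCounter:
        def __init__(self, step, period, total):
            self.step = step        # up() calls between printouts (assumed)
            self.period = period    # seconds between time-based printouts (assumed)
            self.total = total      # expected number of batches
            self.current = 0

        def up(self):
            """Register one processed batch and print rough progress."""
            self.current += 1
            if self.total and self.current % max(self.step, 1) == 0:
                print("{0}/{1} batches processed".format(self.current, self.total))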
Example #3
    def links_in_urls_base(pid, host):
        """ Put links in url_base table (MySQL) for site tree build """
        links_per_time_limit = 50
        c = WSCounter(1, 60, int(Registry().get('mongo').spider_urls.count() / links_per_time_limit))
        UrlsBase = UrlsBaseModel()
        host_id = HostsModel().get_id_by_name(pid, host)

        skip = 0
        while True:
            links = mongo_result_to_list(
                Registry().get('mongo').spider_urls.find().skip(skip).limit(links_per_time_limit)
            )
            for link in links:
                url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
                UrlsBase.add_url(host_id, url)
            skip += len(links)
            c.up()

            if len(links) < links_per_time_limit:
                break
Example #4
    def links_in_urls_base(pid, host):
        """ Put links in url_base table (MySQL) for site tree build """
        links_per_time_limit = 50
        c = WSCounter(1, 60, int(Registry().get('mongo').spider_urls.count() / links_per_time_limit))
        UrlsBase = UrlsBaseModel()
        host_id = HostsModel().get_id_by_name(pid, host)

        skip = 0
        while True:
            links = mongo_result_to_list(
                Registry().get('mongo').spider_urls.find().skip(skip).limit(links_per_time_limit)
            )
            for link in links:
                url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
                UrlsBase.add_url(host_id, url)
            skip += len(links)
            c.up()

            if len(links) < links_per_time_limit:
                break
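All four examples page through the `spider_urls` collection with the same `skip`/`limit` loop. For reference, a minimal self-contained version of that pattern with plain PyMongo might look like the sketch below; the connection details, database name and collection name are assumptions for illustration only.

    # Minimal sketch of the skip/limit pagination pattern used above.
    # Assumes a local MongoDB instance and a 'spider_urls' collection.
    from pymongo import MongoClient

    def iter_in_batches(collection, batch_size=50):
        """Yield documents from the collection in batches of batch_size."""
        skip = 0
        while True:
            batch = list(collection.find().skip(skip).limit(batch_size))
            if not batch:
                break
            yield batch
            skip += len(batch)
            if len(batch) < batch_size:
                break

    if __name__ == '__main__':
        client = MongoClient('localhost', 27017)   # assumed connection
        for batch in iter_in_batches(client['ws']['spider_urls']):
            for doc in batch:
                url = doc['path'] + '?' + doc['query'] if doc.get('query') else doc['path']
                print(url)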