def links_in_spider_base(pid, host):
    """ Put links found by the spider into MySQL """
    links_per_time_limit = 50

    # Progress counter: one tick per batch of links_per_time_limit documents.
    c = WSCounter(
        1, 60,
        int(Registry().get('mongo').spider_urls.count() / links_per_time_limit)
    )

    Urls = UrlsModel()
    host_id = HostsModel().get_id_by_name(pid, host)

    urls_add = []
    skip = 0
    while True:
        # Fetch the next batch of spidered links from MongoDB.
        links = mongo_result_to_list(
            Registry().get('mongo').spider_urls.find()
            .skip(skip).limit(links_per_time_limit)
        )

        for link in links:
            url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
            urls_add.append({
                'url': url,
                'referer': link['referer'],
                'response_code': link['code'],
                'response_time': link['time'],
                'size': link['size'],
                'who_add': 'spider',
                'spidered': link['checked']
            })

        # Bulk-insert the batch, then reset the buffer for the next one.
        Urls.add_mass(pid, host_id, urls_add)
        urls_add = []

        # Collect per-field updates; 'spidered' is only set for checked links.
        to_update = {'spidered': [], 'code': [], 'time': [], 'size': []}
        for link in links:
            url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
            if link['checked']:
                to_update['spidered'].append({'url': url, 'value': 1})
            to_update['code'].append({'url': url, 'value': link['code']})
            to_update['time'].append({'url': url, 'value': link['time']})
            to_update['size'].append({'url': url, 'value': link['size']})

        Urls.update_url_field_mass(pid, host, 'spidered', to_update['spidered'])
        Urls.update_url_field_mass(pid, host, 'response_code', to_update['code'])
        Urls.update_url_field_mass(pid, host, 'response_time', to_update['time'])
        Urls.update_url_field_mass(pid, host, 'size', to_update['size'])

        skip += len(links)
        c.up()

        # A short batch means the cursor is exhausted.
        if len(links) < links_per_time_limit:
            break
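# Both loaders rebuild the URL string with the same conditional expression.
# A helper like the one below (hypothetical, not part of the original module)
# would express that once. Note the precedence: Python's conditional expression
# binds looser than +, so the whole concatenation forms the "true" branch.
def _link_url(link):
    """ Sketch: rebuild a URL string from a spider_urls document. """
    return link['path'] + '?' + link['query'] if link['query'] else link['path']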
def links_in_urls_base(pid, host):
    """ Put links in the url_base table (MySQL) for the site tree build """
    links_per_time_limit = 50

    # Progress counter; int() keeps the batch count integral, matching
    # links_in_spider_base and staying correct under Python 3 division.
    c = WSCounter(
        1, 60,
        int(Registry().get('mongo').spider_urls.count() / links_per_time_limit)
    )

    UrlsBase = UrlsBaseModel()
    host_id = HostsModel().get_id_by_name(pid, host)

    skip = 0
    while True:
        # Fetch the next batch of spidered links from MongoDB.
        links = mongo_result_to_list(
            Registry().get('mongo').spider_urls.find()
            .skip(skip).limit(links_per_time_limit)
        )

        for link in links:
            url = link['path'] + '?' + link['query'] if len(link['query']) else link['path']
            UrlsBase.add_url(host_id, url)

        skip += len(links)
        c.up()

        # A short batch means the cursor is exhausted.
        if len(links) < links_per_time_limit:
            break
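# Usage sketch (assumptions: Registry() already holds a live 'mongo' connection,
# and pid/host name a project and host known to HostsModel; the values below
# are illustrative, not from the original module):
#
#   links_in_spider_base(1, 'example.com')
#   links_in_urls_base(1, 'example.com')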