# -*- coding: utf-8 -*-
"""Entry-point script that runs the Pravda news archive spider."""
from grab.tools.logs import default_logging

from spiders.pravda_archive import PravdaArchiveSpider
from config import default_spider_params


def main():
    """Run ``PravdaArchiveSpider`` once and print its statistics.

    The spider is constructed from ``default_spider_params()``, given its
    Grab options via ``setup_grab``, and run. Whatever ``render_stats()``
    returns is printed to stdout once ``run()`` has returned.
    """
    default_logging()

    # The banner names the spider that is actually being started.
    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("Scrape Pravda archive")

    bot = PravdaArchiveSpider(**default_spider_params())
    bot.setup_grab(timeout=4096, connect_timeout=10)
    bot.run()

    print(bot.render_stats())


if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
"""Entry-point script that runs ``StoreSpider`` with randomised headers."""
from random import randint

from grab.tools.logs import default_logging

from spiders.gis import StoreSpider
from config import default_spider_params


def my_headers():
    """Build the HTTP request headers handed to ``setup_grab``.

    The final ``q`` weight of the ``Accept``, ``Accept-Language`` and
    ``Accept-Charset`` values is drawn with ``randint`` on every call, so
    separate calls may return different header values.

    Returns:
        dict: header name mapped to header value; all values are strings.
    """
    # Draw the weights in the same order in which the headers are listed.
    accept_q = randint(2, 5)
    language_q = randint(5, 9)
    charset_q = randint(5, 7)

    headers = {}
    headers['Accept'] = (
        'text/xml,application/xml,application/xhtml+xml,'
        'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.%d'
    ) % accept_q
    headers['Accept-Language'] = 'ru-ru,ru;q=0.%d' % language_q
    headers['Accept-Charset'] = 'utf-8,windows-1251;q=0.7,*;q=0.%d' % charset_q
    headers['Keep-Alive'] = '300'
    # Empty value on purpose -- presumably to stop the HTTP client from
    # sending its own "Expect" header; TODO confirm.
    headers['Expect'] = ''
    return headers


if __name__ == '__main__':
    default_logging()

    spider_params = default_spider_params()
    bot = StoreSpider(**spider_params)

    # The headers are generated once, after the spider has been created,
    # and reused for the whole run.
    grab_options = {
        'timeout': 4096,
        'connect_timeout': 10,
        'headers': my_headers(),
    }
    bot.setup_grab(**grab_options)

    bot.run()
    print(bot.render_stats())
# -*- coding: utf-8 -*-
"""Entry-point script that runs ``SimpleSpider`` with randomised headers."""
from random import randint

# NOTE(review): Grab, Spider, Task and Session are not referenced below;
# the imports are kept in case importing them matters elsewhere.
from grab import Grab
from grab.spider import Spider, Task
from grab.tools.logs import default_logging

from spiders.simple import SimpleSpider
from config import default_spider_params, Session


def my_headers():
    """Build the HTTP request headers handed to ``setup_grab``.

    The final ``q`` weight of the ``Accept``, ``Accept-Language`` and
    ``Accept-Charset`` values is drawn with ``randint`` on every call, so
    separate calls may return different header values.

    Returns:
        dict: header name mapped to header value; all values are strings.
    """
    # Draw the weights in the same order in which the headers are listed.
    accept_q = randint(2, 5)
    language_q = randint(5, 9)
    charset_q = randint(5, 7)

    headers = {}
    headers['Accept'] = (
        'text/xml,application/xml,application/xhtml+xml,'
        'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.%d'
    ) % accept_q
    headers['Accept-Language'] = 'ru-ru,ru;q=0.%d' % language_q
    headers['Accept-Charset'] = 'utf-8,windows-1251;q=0.7,*;q=0.%d' % charset_q
    headers['Keep-Alive'] = '300'
    # Empty value on purpose -- presumably to stop the HTTP client from
    # sending its own "Expect" header; TODO confirm.
    headers['Expect'] = ''
    return headers


if __name__ == '__main__':
    default_logging()

    spider_params = default_spider_params()
    bot = SimpleSpider(**spider_params)

    # The headers are generated once, after the spider has been created,
    # and reused for the whole run.
    grab_options = {
        'timeout': 4096,
        'connect_timeout': 10,
        'headers': my_headers(),
    }
    bot.setup_grab(**grab_options)

    bot.run()
    print(bot.render_stats())
"""Command-line runner for ``ExploreSpider`` and ``LangPythonSpider``.

Without options the explore spider is run; ``-p``/``--python`` selects the
Python-projects spider instead.
"""
from optparse import OptionParser

from grab import Grab
from grab.spider import Spider, Task
from grab.tools.logs import default_logging

from spiders.explore import ExploreSpider
from spiders.lang_python import LangPythonSpider
from config import default_spider_params, Session

# Proxy list file used unless ``--proxylist`` overrides it.
DEFAULT_PROXYLIST = '/var/proxylist.txt'


def _parse_options(argv=None):
    """Parse command-line options and return the optparse options object.

    Args:
        argv: argument list to parse; ``None`` lets optparse read
            ``sys.argv[1:]``.

    Returns:
        The options object with ``parse_python`` (bool) and ``proxylist``
        (str) attributes.
    """
    parser = OptionParser()
    parser.add_option(
        "-p", "--python",
        action="store_true", dest="parse_python", default=False,
        help="run the Python projects spider instead of the explore spider",
    )
    parser.add_option(
        "--proxylist",
        dest="proxylist", default=DEFAULT_PROXYLIST, metavar="FILE",
        help="proxy list file passed to setup_proxylist [default: %default]",
    )
    options, _args = parser.parse_args(argv)
    return options


def main(argv=None):
    """Run the selected spider once and print its statistics.

    Args:
        argv: optional argument list; ``None`` means ``sys.argv[1:]``.
    """
    default_logging()
    options = _parse_options(argv)

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    if options.parse_python:
        print("Scrape python projects")
        bot = LangPythonSpider(**default_spider_params())
    else:
        print("Scrape trending")
        bot = ExploreSpider(**default_spider_params())

    # NOTE(review): the proxy list is applied to both spiders here --
    # confirm it was not meant for the explore spider only.
    bot.setup_proxylist(options.proxylist, 'http', auto_change=True)
    bot.setup_grab(timeout=4096, connect_timeout=10)
    bot.run()

    print(bot.render_stats())


if __name__ == '__main__':
    main()