dest='robots', default='/lm/data2/scrapers/eng-USA/epg/www.tv.com' '/log/robots.zip', help='Set the robots for robots.zip file [Default %default]' ) parser.add_option( '--basepath', '-b', dest='basepath', default='/lm/data2/', help='Set the basepath for outputfile location [Default %default]' ) options, args = parser.parse_args() log = Logger(options.debug) myScraper = WebScraper( scraperType = 'scrapers', topic = 'urls', lang = 'xxx-XXX', name = 'gatherproxy.com', frequency = 'inc', ) if options.robots: # set the robots.txt for the scraper myScraper.setRobotsTxt(url='http://gatherproxy.com/', zip=options.robots) myScraper.setBasePath(options.basepath)
dest='badUrlsFile',
# Default location of the bad-URL list (two adjacent literals are
# concatenated into a single path).
default='/lm/data2/scrapers/zho-CHN/movies/v.qq.com.movie'
        '/log.inc/v.qq.com.movie.badUrls.lst',
help=
'Prints unusable URLs to external file instead of halting the scraper.'
)
# --small: boolean flag stored as ``options.run_small`` (False unless given).
parser.add_option(
    '--small',
    action='store_true',
    dest='run_small',
    default=False,
    help='if run spider by small data set, this is for debug.')

# Parse the command line; positional arguments are collected in ``args``.
options, args = parser.parse_args()

# ``options.debug`` comes from an option declared outside this fragment.
log = Logger(options.debug)

# NOTE(review): the source was whitespace-collapsed; the if/else nesting
# below is reconstructed as two sibling statements -- confirm.

# Copy the flag into the name ``run_small``. Whether that name is a module
# global or a local depends on the enclosing scope, which is not visible here.
if options.run_small:
    run_small = options.run_small

# ``options.html`` is declared outside this fragment. When it is set, an
# HTMLScraper is built and handed that value via inputDataBall(); otherwise
# a WebScraper is built instead.
if options.html:
    myScraper = HTMLScraper(scraperType=u'scrapers',
                            topic=u'movies',
                            lang=u'zho-CHN',
                            name=u'v.qq.com.movie',
                            frequency=u'versions')
    myScraper.inputDataBall(options.html)
else:
    # The WebScraper(...) call continues past the end of this fragment.
    myScraper = WebScraper(scraperType=u'scrapers',
                           topic=u'movies',
                           lang=u'zho-CHN',
                           name=u'v.qq.com.movie',