def getLinks(start_urls, allowed_domains, filename):
    # Newest code: Scrapydo
    scrapydo.run_spider(LinkSpider, start_urls=start_urls, allowed_domains=allowed_domains)
    link_items = removeCycles(start_urls, LinkSpider.link_items)
    genCSV(filename, start_urls, link_items)
    return link_items

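# A minimal, hedged sketch of the one-time scrapydo initialisation that the
# examples above and below rely on: scrapydo.setup() must run once per process
# before any run_spider() call (main() and jobhunt() further down do exactly
# that). MySpider is a hypothetical placeholder spider, not part of the
# original examples; by default run_spider() blocks until the crawl finishes
# and returns the captured items.
import scrapy
import scrapydo

scrapydo.setup()  # initialise crochet/Twisted once, before any crawl

class MySpider(scrapy.Spider):
    name = 'my-spider'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

items = scrapydo.run_spider(MySpider)  # blocks until the crawl finishes
print(items)
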
def DPSSpiderCrawl(self):
    global DPS
    DPS = []
    scrapydo.run_spider(DPSSpider(), settings={
        'USER_AGENT': 'Mozilla/5.0',
    })
    DPSChunk = SpiderQueen.divide_chunks(DPS)
    return DPSChunk

def TankSpiderCrawl(self):
    global Tank
    Tank = []
    scrapydo.run_spider(TankSpider(), settings={
        'USER_AGENT': 'Mozilla/5.0',
    })
    TankChunk = SpiderQueen.divide_chunks(Tank)
    return TankChunk

def HealSpiderCrawl(self):
    global Heal
    Heal = []
    scrapydo.run_spider(HealSpider(), settings={
        'USER_AGENT': 'Mozilla/5.0',
    })
    HealChunk = SpiderQueen.divide_chunks(Heal)
    return HealChunk

def runcrawl():
    """
    Run a spider within Twisted. Once it completes,
    wait 5 seconds and run another spider.
    """
    try:
        scrapydo.run_spider(JubiNoticeSpider)
        scrapydo.run_spider(YunBiNoticeSpider)
    except Exception as e:
        print(e)

def get(self):
    scrapydo.run_spider(self.spider,
                        crawl_reason='check-vulnerability',
                        injector=self.injector,
                        settings=DVWA_CRAWLER_SETTINGS)
    storage_adapter = FileAdapter()
    parser = CheckVulnerabilityParser(self.crawled_site, storage_adapter)
    vulnerable_pages = parser.check()
    self.write({'vulnerable': json.dumps(vulnerable_pages)})

def get(self):
    scrapydo.run_spider(self.spider,
                        crawl_reason='get-db-users',
                        injector=self.injector,
                        settings=DVWA_CRAWLER_SETTINGS)
    storage_adapter = FileAdapter()
    parser = GetDbUsersParser(self.crawled_site, storage_adapter)
    db_usernames = parser.get_db_usernames()
    self.write(json.dumps(db_usernames))

def get(self):
    scrapydo.run_spider(self.spider,
                        crawl_reason='get-db-version',
                        injector=self.injector,
                        settings=DVWA_CRAWLER_SETTINGS)
    storage_adapter = FileAdapter()
    parser = GetDbVersionParser(self.crawled_site, storage_adapter)
    db_version = parser.get_db_version()
    self.write({'db_version': db_version})

def run_crawl(spider):
    #######################################################################
    # Define a StreamHandler that writes INFO-level (or higher) log records
    # to stderr, then attach it to the root logger.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
    console.setFormatter(formatter)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.ERROR)
    logging.getLogger('').addHandler(console)
    scrapydo.run_spider(spider_cls=spider)

def crawler_reddit(subreddits):
    message = ''
    max = 10
    count = 0
    data = scrapydo.run_spider(RedditSpider(), settings=settings, subreddits=subreddits)
    for item in data:
        count += 1
        if item["title"] == '':
            title = "_No Title_ :("
        else:
            title = item["title"]
        message += "*" + item["subreddit"] + "*, votes " + str(item["upvote"]) + " " + "[" + title + "](" + \
                   item["thread_link"] + ") \n"
        if count > max:
            break
    if len(data) == 0:
        # "Sorry, there are no hot topics in these subreddits of yours :("
        message = "Desculpe, não há nenhum tópico quente nestes seus reddits :("
    return message

def start():
    spiders = [
        UNDPjobSpider,
        # ESCAPjobsSpider,
        # UNESCOjobSpider,
        # ITERJobSpider,
        # CERNjobsSpider,
        # # UNIDOjobLink,
        # UNUjobSpider,
        # WHOjobSpider,
        # UNEPJobSpider,
        # OECDJobSpider,
        # WIPOjobSpider
    ]
    for spider in spiders:
        scrapydo.run_spider(spider_cls=spider)

def crawl(spider: scrapy.Spider, settings: scrapy.settings.Settings, args: dict) -> scrapy.crawler.Crawler:
    crawler = scrapydo.run_spider(
        spider,
        settings=settings,
        return_crawler=True,
        capture_items=True,
        **args
    )
    return crawler

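# Hedged usage sketch for the crawl() wrapper above: because it passes
# return_crawler=True, the return value is the Crawler object rather than the
# plain item list, so standard Scrapy stats are available via crawler.stats.
# MySpider and the empty args dict are placeholders, not part of the original.
from scrapy.utils.project import get_project_settings

crawler = crawl(MySpider, get_project_settings(), {})
print(crawler.stats.get_value('item_scraped_count'))
print(crawler.stats.get_value('finish_reason'))
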
def test_run_spider(self):
    TestSpider = make_test_spider(self.url)
    items = scrapydo.run_spider(TestSpider)
    self.assertEqual(items, [
        {'path': '/text'},
        {'path': '/html'},
    ])

def run_scrapydoProcess(self, spider, index, pages=10):
    # Run with scrapydo -- setup spider args
    spider_args = {
        'symbol': index,
        'pages': pages,
        'capture_items': True,
        'timeout': 360,
        'settings': self.settings
    }
    return scrapydo.run_spider(spider, **spider_args)

def start(self):
    '''Start the spiders with scrapydo.
    To run additional spiders, add them to the spiders list.'''
    if self.type == 'job':
        logger.debug("进行岗位准备")  # "preparing job listings"
        spiders = [
            # UNDPjobSpider,
            # CERNjobsSpider,
            # ITERJobSpider,
            MOHRSSJobSpider,  # the only domestic (Chinese) site; proxy IPs work well against it
            # OECDJobSpider,
            # UNIDOjobLink,
            # UNUjobSpider,
            # WHOjobSpider,
            # WIPOjobSpider,
        ]
    else:
        spiders = []
    for spider in spiders:
        scrapydo.run_spider(spider_cls=spider)

def start_scrape(event, context):
    jobs = []
    spider_map = {
        'stack_overflow': StackOverflowSpider,
        # 'dice': DiceSpider
    }
    spider_args = {'search_params': event['search_params']}
    spider = spider_map[event['spider']]
    scrapydo.run_spider(spider, **spider_args)
    print("Finished scraping...")
    with open("/tmp/jobs.json") as job_file:
        jobs = json.load(job_file)
    open("/tmp/jobs.json", 'w').close()  # Clean the jobs file after we are done with it.
    return jobs

def pull_content(start_urls):
    # results = []
    #
    # def crawler_results(signal, sender, item, response, spider):
    #     results.append(item)
    #
    # dispatcher.connect(crawler_results, signal=signals.item_passed)
    #
    # process = CrawlerProcess()
    # process.crawl(ContentSpider, start_urls=start_urls)
    # process.start()  # the script will block here until the crawling is finished
    #
    results = scrapydo.run_spider(ContentSpider, start_urls=start_urls)
    return results

def main(argv):
    scrapydo.setup()
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)")
    settings.set("FEED_FORMAT", "json")
    settings.set("FEED_URI", "result.json")
    try:
        opts, args = getopt.getopt(argv, "hs:", ["subreddit="])
    except getopt.GetoptError:
        # Usage: "cli_crawler.py -s <comma-separated list of subreddits, e.g. programming;dogs;brazil>"
        print(
            'cli_crawler.py -s <lista de subreddit separado por vírgula, ex. programming;dogs;brazil>'
        )
        sys.exit(2)
    if len(opts) == 0:
        print(
            'cli_crawler.py -s <lista de subreddit separado por vírgula, ex. programming;dogs;brazil>'
        )
    for opt, arg in opts:
        if opt == '-s':
            subreddits = arg
            # "Starting crawler to fetch data for subreddits ..."
            print("Iniciando crawler para buscar dados dos subreddits " + subreddits + "...")
            data = scrapydo.run_spider(RedditSpider(), settings=settings, subreddits='askreddit')
            for item in data:
                if item["title"] == '':
                    title = "_No Title_ :("
                else:
                    title = item["title"]
                message = item["subreddit"] + ", votes " + str(item["upvote"]) + " " + "[" + title + "](" + \
                          item["thread_link"] + ") \n"
                print(message)
    sys.exit()

def my_crawl():
    items = scrapydo.run_spider(RunnerSpider)
    text_file = open("tmp.json", "w")
    text_file.write("%s" % items)
    text_file.close()
    f1 = open('tmp.json', 'r')
    f2 = open('tmp_result.json', 'w')
    for line in f1:
        f2.write(line.replace("'", '"'))
    f1.close()
    f2.close()
    f3 = open('tmp_result.json', 'r')
    f4 = open('result.json', 'w')
    for line in f3:
        f4.write(line.replace('u"', '"'))
    f3.close()
    f4.close()

def Queen(self):
    global rankings
    rankings = {"DPS": [], "Heal": [], "TDPS": []}
    scrapydo.run_spider(DPSSpider(), settings={
        'USER_AGENT': 'Mozilla/5.0',
    })
    scrapydo.run_spider(TankSpider(), settings={
        'USER_AGENT': 'Mozilla/5.0',
    })
    scrapydo.run_spider(HealSpider(), settings={
        'USER_AGENT': 'Mozilla/5.0',
    })
    return rankings

os.chdir(sys.path[0])

reload(sys)
sys.setdefaultencoding('utf-8')

if not os.path.exists('log'):
    os.makedirs('log')

logging.basicConfig(filename='log/proxy.log',
                    format='%(levelname)s %(asctime)s: %(message)s',
                    level=logging.DEBUG)

sql = SqlHelper()

while True:
    utils.log('*******************run spider start...*******************')

    command = 'DELETE FROM {0} WHERE save_time < now() - {1}'.format(
        config.free_ipproxy_table, 1800)
    sql.execute(command)

    items = scrapydo.run_spider(XiCiDaiLiSpider)
    items = scrapydo.run_spider(SixSixIpSpider)
    items = scrapydo.run_spider(IpOneEightOneSpider)
    items = scrapydo.run_spider(KuaiDaiLiSpider)
    items = scrapydo.run_spider(GatherproxySpider)

    utils.log('*******************run spider waiting...*******************')
    time.sleep(300)

logging.basicConfig(
    filename='log/crawl_proxy.log',
    format='%(levelname)s %(asctime)s: %(message)s',
    level=config.log_level
)

sql = SqlManager()

spiders = [
    XiCiDaiLiSpider,
    SixSixIpSpider,
    IpOneEightOneSpider,
    # KuaiDaiLiSpider,  # handled via a different approach
    # GatherproxySpider,
    HidemySpider,
    ProxylistplusSpider,
    # FreeProxyListsSpider,
    # PeulandSpider,  # target site no longer available
    # UsProxySpider,
    ProxyDBSpider,
    # ProxyRoxSpider,
]

while True:
    utils.log('*******************run spider start...*******************')
    sql.delete_old(config.free_ipproxy_table, 0.5)
    for spider in spiders:
        scrapydo.run_spider(spider_cls=spider)
    utils.log('*******************run spider waiting...*******************')
    time.sleep(1200)

sql = SqlHelper()

spiders = [
    XiCiDaiLiSpider,
    SixSixIpSpider,
    IpOneEightOneSpider,
    KuaiDaiLiSpider,  # the site now runs a JS check before access (anti-scraping)
    GatherproxySpider,
    HidemySpider,
    ProxylistplusSpider,
    FreeProxyListsSpider,
    # PeulandSpider,  # target site no longer available
    UsProxySpider,
    ProxyDBSpider,
    ProxyRoxSpider,
]

while True:
    utils.log('*******************run spider start...*******************')

    command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL 0.2 DAY)".format(
        table=config.free_ipproxy_table)
    sql.execute(command)

    for spider in spiders:
        scrapydo.run_spider(spider)

    utils.log('*******************run spider waiting...*******************')
    time.sleep(1200)

def update_db():
    while True:
        scrapydo.run_spider(FolhaSpider)
        scrapydo.run_spider(EconomiaSpider)
        sleep(120)

reload(sys)
sys.setdefaultencoding('utf-8')

if not os.path.exists('log'):
    os.makedirs('log')

logging.basicConfig(filename='log/validator.log',
                    format='%(levelname)s %(asctime)s: %(message)s',
                    level=logging.DEBUG)

while True:
    utils.log(
        '----------------validator start time:%s...-----------------------' %
        datetime.datetime.now().strftime('%Y:%m:%d %H:%M:%S:%f'))

    items = scrapydo.run_spider(HttpBinSpider)
    utils.log(
        '----------------validator finish:%s time:%s-----------------------' %
        (HttpBinSpider.name, datetime.datetime.now().strftime('%Y:%m:%d %H:%M:%S:%f')))

    time.sleep(10)

    items = scrapydo.run_spider(DoubanSpider)
    utils.log(
        '----------------validator finish:%s time:%s-----------------------' %
        (DoubanSpider.name, datetime.datetime.now().strftime('%Y:%m:%d %H:%M:%S:%f')))

    # items = scrapydo.run_spider(GatherSpider)
    # items = scrapydo.run_spider(AssetStoreSpider)

def test_run_spider(self):
    TestSpider = make_test_spider(self.url)
    items = scrapydo.run_spider(TestSpider)
    self.assertEqual(items, [{"path": "/text"}, {"path": "/html"}])

from ipproxytool.spiders.validator.assetstore import AssetStoreSpider
from ipproxytool.spiders.validator.gather import GatherSpider

scrapydo.setup()

if __name__ == '__main__':
    os.chdir(sys.path[0])

    reload(sys)
    sys.setdefaultencoding('utf-8')

    if not os.path.exists('log'):
        os.makedirs('log')

    logging.basicConfig(filename='log/validator.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    utils.make_dir('log')

    while True:
        utils.log('----------------validator start...-----------------------')

        items = scrapydo.run_spider(DoubanSpider)
        # items = scrapydo.run_spider(GatherSpider)
        # items = scrapydo.run_spider(AssetStoreSpider)

        utils.log('*************************validator waiting...*************************')
        time.sleep(60)

"max": _max }) print link["district"], link["location"], "均价:", _avg, "最低:", _min, "最高:", _max if __name__ == "__main__": logging.basicConfig( filename='spider.log', format='%(levelname)s %(asctime)s: %(message)s', level=logging.DEBUG ) # while True: mongo = MongoDBPipeline() t = time.time() # 1、爬取连接,并更新district scrapydo.run_spider(LinkSpider) # 2、爬取item while True: print "爬取房源中....." scrapydo.run_spider(ItemSpider) if mongo.get_failed_urls().count() == 0: break print "开始再次爬取房源...." print "爬取结束, 耗时%d秒" % (time.time() - t) # 3、根据location的名字进行统计 print "开始统计..." summarize()
def test_run_spider(self):
    TestSpider = make_test_spider(self.url)
    items = scrapydo.run_spider(TestSpider, name='test-spider')
    self.assertEqual(set(it['path'] for it in items), {'/text', '/html'})

logging.basicConfig(
    level=logging.INFO,  # set the log level to INFO
    format='%(levelname)s :%(asctime)s: %(message)s',
    filename='crawl-%s.log' % datetime.now().strftime('%Y%m%d'),
)

while True:
    if printer_status:
        print('the spider started at %s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    logging.info('the spider started')

    scrapydo.default_settings.update({
        'CONCURRENT_REQUESTS': 100,  # the num of concurrent requests
        'CONCURRENT_ITEMS': 500,     # the num of concurrent items
    })
    scrapydo.run_spider(spider_cls=DoubanmovieSpider)

    if printer_status:
        print('the spider stopped at %s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    logging.info('the spider stopped normally')

    # disconnect the pptp vpn and wait for the change of ip
    VPN.disconnect()
    while VPN.getstatus() == 1:
        VPN.connect()
        sleep(3)

    if os.path.exists('status.txt'):
        with open('status.txt') as file:
            if file.readline().strip() == 'exit':
                logging.info('the crawl exited gracefully')

def jobhunt(event, context):
    scrapydo.setup()
    settings = get_project_settings()
    scrapydo.run_spider(spider_dictionary[event['name']], settings=settings)

async def get(self, *args, **kwargs):
    return_developing()
    scrapydo.run_spider(XicidailiSpider)
    self.do_success({'ok': 1}, 'todo')