def service_sis(self):
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(worker.Worker)
    process.start()  # the script will block here until the crawling is finished
class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    # __init__

    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    # run
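# Usage sketch for the CrawlerWorker above (an assumption, not part of the
# original snippet): run the crawl in a child process and hand the scraped
# items back through the multiprocessing.Queue. `MySpider` is a placeholder
# for whatever spider class the surrounding (legacy Scrapy) project provides.
import multiprocessing


def collect_items(spider):
    result_queue = multiprocessing.Queue()
    worker = CrawlerWorker(spider, result_queue)
    worker.start()               # executes CrawlerWorker.run() in a child process
    items = result_queue.get()   # blocks until the worker puts its item list
    worker.join()
    return items


# items = collect_items(MySpider())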
class BaseScraper(CrawlSpider):
    name = "base"
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item'),
    )

    def __init__(self, index, start_urls, allowed_domains=[], *args, **kwargs):
        self.allowed_domains = allowed_domains
        self.start_urls = start_urls
        self.index = index
        super(BaseScraper, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = {}
        item["body"] = response.body
        yield item

    # Instantiates a CrawlerProcess, which spins up a Twisted reactor.
    def connect(self):
        self.process = CrawlerProcess(get_project_settings())

    # Start the scraper. The crawl process must be instantiated with the same
    # attributes as the instance.
    def start(self):
        self.connect()
        self.process.crawl(
            self.name,
            self.index,
            start_urls=self.start_urls,
            allowed_domains=self.allowed_domains,
        )
        self.process.start()
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "ch:t:s:", ['title=', 'section='])
    except getopt.GetoptError:
        print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage:\npython2.7 decc.py -h(help)\npython2.7 decc.py -c(crawl articles)\npython2.7 decc.py -s(search article by section) <section>\npython2.7 decc.py -t(search article by title) <title>'
            sys.exit()
        elif opt == '-c':
            # start crawling articles here
            print "crawling"
            process = CrawlerProcess(get_project_settings())
            process.crawl(BBCArticleSpider)
            process.start()
        elif opt in ('-t', '--title'):
            print "search by title"
            # start searching articles by title
            results = BBCArticleItem.fetch_by_title(arg)
            for result in results:
                print result
        elif opt in ('-s', '--section'):
            print "search by section"
            # start searching articles by section
            results = BBCArticleItem.fetch_by_section(arg)
            for result in results:
                print result
def crawl(ctx, spiders, stats):
    """
    Crawl one, many, or all pages.

    What spider(s) to run is determined in the following order:

    1. Spider(s) given as argument(s)
    2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
def __init__(self, titlesfile=None, platform=None, region=None):

    # set default encoding to utf8 for parsing and logging
    # utf-8 characters in console and files
    # reload(sys)
    sys.setdefaultencoding('utf8')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='export.log',
        filemode='a',
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )

    # identify platform
    self.platform = platform
    if self.platform is None:
        logging.error('No platform found! Pass it as an argument.')
        return
    else:
        platformId = platforms.getId(self.platform)
        if platformId is None:
            logging.error('Platform ' + self.platform + ' not supported.')
            return

    self.titlesfile = titlesfile
    self.region = region
    if self.region is None:
        self.region = "Worldwide"

    if titlesfile:
        titles = []
        urls = []
        with open(self.titlesfile) as f:
            titles = f.read().splitlines()
        for title in titles:
            logging.debug('Submitting title: ' + title)
            urls.append(
                'http://mobygames.com/search/quick'
                '?q=' + title +
                '&p=' + platformId +
                '&search=Go'
                '&sFilter=1'
                '&sG=on'
                '&search_title=' + urllib.quote(title) +
                '&search_platform=' + urllib.quote(self.platform) +
                '&search_region=' + urllib.quote(self.region)
            )

        process = CrawlerProcess(get_project_settings())
        process.crawl(MobygamesSpider, start_urls=urls)
        process.start()
    else:
        logging.warning('No file.')
def run_spider():
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
    settings.set("ITEM_PIPELINES", {
        'pipelines.FilterProxyPipline': 1,
        'pipelines.SaveProxyPipeline': 2
    })
    settings.set("LOG_STDOUT", True)

    # Configure logging rules
    # configure_logging({
    #     'filename': datetime.now().strftime('%Y_%m_%d_%H_proxy.log'),
    #     'format': '%(asctime)s %(levelname)-8s %(name)-15s %(message)s',
    #     'level': logging.INFO
    # })
    configure_logging(install_root_handler=False)

    # Initialize the log path
    logpath = datetime.now().strftime(log_path)
    if not os.path.isdir(logpath):
        os.makedirs(logpath)
    logging.basicConfig(
        filename=datetime.now().strftime('%s/%s_proxy.log' % (logpath, log_file)),
        format=log_format,
        level=logging.INFO
    )

    process = CrawlerProcess(settings)
    process.crawl(GetProxySpider)
    process.start()
def handle(self, *args, **options):
    setting = {
        'USER_AGENT': options['user_agent'],
        'DOWNLOAD_DELAY': options['download_delay'],
        'LOG_FILE': settings.SCRAPY_LOG_FILE,
        'LOG_LEVEL': settings.SCRAPY_LOG_LEVEL,
    }

    if options['proxy_list']:
        try:
            f = open(options['proxy_list'])
        except IOError as e:
            raise CommandError('cannot open proxy list file for read')

        # Retry many times since proxies often fail
        setting['RETRY_TIMES'] = 10
        # Retry on most error codes since proxies fail for different reasons
        setting['RETRY_HTTP_CODES'] = [500, 503, 504, 400, 403, 404, 408]
        setting['DOWNLOADER_MIDDLEWARES'] = {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
            'spider.randomproxy.RandomProxy': 100,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
        }
        setting['PROXY_LIST'] = options['proxy_list']

    process = CrawlerProcess(setting)
    process.crawl(BaiduSpider)
    process.start()
def news_flash_crawl(rss_link, site_name, maps_key):
    id_flash = get_latest_id_from_db() + 1
    latest_date = get_latest_date_from_db()
    d = feedparser.parse(rss_link)
    process = CrawlerProcess()
    for entry in d.entries[::-1]:
        entry_parsed_date = datetime.strptime(entry.published[:-6], '%a, %d %b %Y %H:%M:%S')
        entry_parsed_date = entry_parsed_date.replace(tzinfo=None)
        if (latest_date is not None and entry_parsed_date > latest_date) or latest_date is None:
            news_item = {'id_flash': id_flash,
                         'date_parsed': entry_parsed_date,
                         'title': entry.title,
                         'link': entry.links[0].href,
                         'date': entry.published,
                         'location': '',
                         'lat': 0,
                         'lon': 0}
            if ((u'תאונ' in entry.title and u'תאונת עבודה' not in entry.title
                 and u'תאונות עבודה' not in entry.title)
                    or ((u'רכב' in entry.title or u'אוטובוס' in entry.title
                         or u"ג'יפ" in entry.title or u'משאית' in entry.title
                         or u'קטנוע' in entry.title or u'אופנוע' in entry.title
                         or u'אופניים' in entry.title or u'קורקינט' in entry.title
                         or u'הולך רגל' in entry.title or u'הולכת רגל' in entry.title
                         or u'הולכי רגל' in entry.title)
                        and (u'נפגע' in entry.title or u'פגיע' in entry.title
                             or u'נפצע' in entry.title or u'פציע' in entry.title
                             or u'התנגש' in entry.title or u'התהפך' in entry.title
                             or u'התהפכ' in entry.title))):
                news_item['accident'] = True
            else:
                news_item['accident'] = False
            if site_name == 'ynet':
                news_item['source'] = 'ynet'
                process.crawl(YnetFlashScrap, entry.links[0].href, news_item=news_item, maps_key=maps_key)
            id_flash = id_flash + 1
    process.start()
def main(tabLink):
    if "ultimate-guitar.com" in tabLink:
        tabSpider = Spiders.Ultimate(tabLink)
    elif "guitartabs.cc" in tabLink:
        tabSpider = Spiders.TabCC(tabLink)
    else:
        print("Domain name not supported.")
        return

    # Make a process to instantiate a spider with the given
    # arguments and make it crawl the link
    process = CrawlerProcess(get_project_settings())
    process.crawl(tabSpider, link=tabLink)
    process.start()

    # The link has been scraped, now process it
    tree = xmltree.parse(tabs.pipelines.filename)
    root = tree.getroot()
    value = root[0][0][0]
    rawTab = value.text
    if r"\M" in rawTab:
        rawTab = parsefuncs.removeLineEndings(rawTab)
    cleanTab = parsefuncs.parseTab(rawTab)
    print("Clean tab is:")
    count = 0
    for line in cleanTab:
        count += 1
        print(line)
        if count % 6 == 0:
            print(" ")
def run(self):
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('stackoverflow')
    process.start()
def ScrapeSite():
    db = 'crunchbase_startups'
    sitedomain = raw_input("Enter site domain: ")  # get user input
    sitedomain = parse_base_url(sitedomain)  # clean url

    sql = 'SELECT text FROM {} WHERE siteurl = %s'.format(db)
    cur.execute(sql, sitedomain)
    sitetext = cur.fetch()
    if sitetext != '':  # what does an empty ping return?
        print 'Site already scraped.'
        return sitetext

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'pipelines.UserInputPipeline': 100},
        'DEPTH_LIMIT': 2,
        'DOWNLOAD_HANDLERS': {'s3': None},
        'LOG_LEVEL': 'INFO'
    })
    process.crawl(SoloSpider, domain=sitedomain)
    process.start()

    # presumably finished here - pull newly loaded sitetext for domain
    cur.execute(sql, sitedomain)
    return cur.fetch()
def get(self):
    while True:
        process = CrawlerProcess(get_project_settings())
        process.crawl('iqiyi')
        process.start()
        time.sleep(3000)
    self.finish()
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def handle(self, *args, **options):
    # It would be better to pass this in as a parameter to PayoutSpider
    global start_date
    start_date = datetime.datetime(2015, 1, 1, tzinfo=pytz.UTC)

    delete = options.get('delete')
    delete_all = options.get('delete_all')
    retrieve_all = options.get('retrieve_all')

    previous_payout = None
    previous_payouts = codementor_models.Payout.objects.all().order_by('-date')
    if delete_all or (delete and previous_payouts.count() == 0):
        codementor_models.Review.objects.all().delete()
        codementor_models.Session.objects.all().delete()
        codementor_models.Payout.objects.all().delete()
        codementor_models.Payment.objects.all().delete()
    elif delete:
        previous_payout = previous_payouts[0]
        codementor_models.Review.objects.filter(date__gt=start_date).delete()
        codementor_models.Session.objects.filter(started_at__gt=start_date).delete()
        previous_payout.delete()
        codementor_models.Payment.objects.filter(payout__isnull=True).delete()

    if not retrieve_all and previous_payout:
        start_date = previous_payout.date

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(PayoutSpider)
    process.start()
def run(urls, city):
    process = CrawlerProcess()
    spiders = [make_spider(artist, url, city) for artist, url in urls]
    for spider_cls in spiders:
        process.crawl(spider_cls)
    # the script will block here until the crawling is finished
    process.start()
def get_scraped_sites_data():
    """Returns output for venues which need to be scraped."""
    class RefDict(dict):
        """A dictionary which returns a reference to itself when deepcopied."""
        def __deepcopy__(self, memo):
            return self

    # Hack: we pass a dictionary which can't be deep-copied into the settings
    # so as to _return_ the scraper output. As far as I can tell, this is the
    # only way to return the scraper output to the script itself.
    output = RefDict()

    settings = Settings({
        'LOG_ENABLED': False,
        'ITEM_PIPELINES': {
            'mgrok.pipelines.JsonWriterPipeline': 1
        },
        'PIPELINE_OUTPUT': output,
        'USER_AGENT': 'Chrome/41.0.2228.0'
    })

    crawler_process = CrawlerProcess(settings)
    for spider in SCRAPY_SPIDERS:
        crawler_process.crawl(spider)
    crawler_process.start()

    return output
def crawl(spiders_classes, connector, debug=False, spider_error_callback=stdout_error_callback):
    """
    Launch a crawl job for the given JobSpider classes.

    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    process = CrawlerProcess({
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False
    })

    for spider_class in spiders_classes:
        process.crawl(spider_class)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
def spiderCrawl(bandname):
    createLink(bandname)
    settings = get_project_settings()
    settings.set('USER_AGENT', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()
def Test_Scapy(self):
    spider = FtpSpider()
    process = CrawlerProcess({
        "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    })
    process.crawl(spider)
    process.start()
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        # if not hasattr(project, 'crawler'):
        #     self.crawler.install()
        # self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider))
        p.start()
        p.join()
        return queue.get(True)
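# Usage sketch for CrawlerScript above (an assumption, not part of the original
# snippet): each call runs the named spider in a fresh child process, which is
# what lets one script trigger several crawls even though the Twisted reactor
# cannot be restarted. 'my_spider' is a placeholder spider name.
def run_named_spider(spider_name='my_spider'):
    script = CrawlerScript()
    return script.crawl(spider_name)  # returns the items collected in the child process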
def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
    busy.value = 1
    if os.path.exists("data.json"):
        os.remove("data.json")
    print("Started crawling task")
    process = CrawlerProcess(get_project_settings())
    process.crawl("od_links", base_url=website.url)
    process.start()
    print("Done crawling")

    self.db.import_json("data.json", website)
    os.remove("data.json")
    print("Imported in SQLite3")

    if post_id:
        # Reply to post
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"": stats}, website.id)
        print(comment)
        if "total_size" in stats and stats["total_size"] > 10000000:
            post = self.reddit_bot.reddit.submission(post_id)
            self.reddit_bot.reply(post, comment)
        else:
            self.reddit_bot.log_crawl(post_id)
    elif comment_id:
        # Reply to comment
        stats = self.db.get_website_stats(website.id)
        comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
        print(comment)
        reddit_comment = self.reddit_bot.reddit.comment(comment_id)
        self.reddit_bot.reply(reddit_comment, comment)

    busy.value = 0
    print("Done crawling task")
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)
    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()

    shell = Shell(crawler)
    shell.code = 'adsf'

    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()

    return fetch
def magic():
    process = CrawlerProcess(get_project_settings())
    # 'magic' is the name of one of the spiders of the project.
    process.crawl('magic')
    process.start()  # the script will block here until the crawling is finished
def scrape(spider):
    with transaction.atomic(), reversion.create_revision():
        process = CrawlerProcess(DEFAULT_CRAWLER_OPTIONS)
        process.crawl(spider)
        # the script will block here until the crawling is finished
        process.start()
    return
def main():
    """Set up the item signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.start()  # schedule spider

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def crawl(spiders_classes, connector, debug=False,
          spider_error_callback=stdout_error_callback, scrapy_settings=None):
    """
    Launch a crawl job for the given JobSpider classes.

    :param scrapy_settings: dict of settings merged with the CrawlerProcess
        default settings
    :param debug: (bool) activate or disable debug
    :param spider_error_callback: callback for spider errors
        (see http://doc.scrapy.org/en/latest/topics/signals.html#spider-error)
    :param connector: Connector instance
    :param spiders_classes: JobSpider class list
    :return: list of spider instances
    """
    if debug:
        dispatcher.connect(spider_error_callback, signals.spider_error)

    settings = {
        'ITEM_PIPELINES': {
            'pyjobs_crawlers.pipelines.RecordJobPipeline': 1,
        },
        'connector': connector,
        'LOG_ENABLED': False,
        'DOWNLOAD_DELAY': 1 if not debug else 0,
    }
    if scrapy_settings:
        settings.update(scrapy_settings)

    process = CrawlerProcess(settings)

    for spider_class in spiders_classes:
        process.crawl(spider_class, debug=debug)

    spiders = []
    for crawler in list(process.crawlers):
        spiders.append(crawler.spider)
    process.start()

    return spiders
def main():
    settings = get_project_settings()
    # TODO: Initialize item pipelines
    # settings.set('ITEM_PIPELINES', {'Program.Scrapy.Items.HikerJournalWriterPipeline': 2})
    crawler = CrawlerProcess(settings=settings)
    spider = HikerScraper()
    crawler.crawl(spider, domain="http://www.trailjournals.com")
    crawler.start()
def scrape_task():
    """Celery task to scrape a website with Scrapy.

    http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    """
    process = CrawlerProcess(get_project_settings())
    process.crawl('eslcafe', domain='eslcafe.com')
    process.start()
def crawl_Info():
    """
    Crawl free Shadowsocks accounts from http://www.ishadowsocks.com/
    and store the results in Result.json.
    """
    process = CrawlerProcess(get_project_settings())
    process.crawl('SSSpider')
    process.start()
def _crawl(path=None):
    crawl = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
        css_selector = '//img'
        for x in response.xpath(css_selector):
            newsel = '::attr(src)'
            links = x.css(newsel).extract_first()
            if links.endswith(".jpg"):
                yield {'Image Link': links}

        # check the next page for images and scrape them too
        Page_selector = '.next a ::attr(href)'
        NextPage = response.css(Page_selector).extract_first()
        if NextPage:
            yield scrapy.Request(response.urljoin(NextPage), callback=self.parse)


process = CrawlerProcess({'FEED_FORMAT': 'json', 'FEED_URI': 'results.json'})
process.crawl(NewSpider)  # Selecting spider class
process.start()

# read the results.json file and display it when run
with open('results.json', 'rt') as filehandle:
    lines = filehandle.readlines()[1:15]
    for line in lines:
        print(website + line.replace('{"Image Link": "', "").replace('"}', "").replace(",", ""))

# To display the website
testurls = ['http://172.18.58.238/hr2/']
import webbrowser
for url in testurls:
            img_path = 'images/' + str("_".join(data['name'].split())) + '.jpg'
            with open(img_path, 'wb') as handle:
                response = requests.get(img_url, stream=True)
                if not response.ok:
                    print(response)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            data['image_path'] = img_path
        else:
            data['image_path'] = None
        yield data


process = CrawlerProcess(settings={
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'celeb.csv'
})
process.crawl(CelebSpider)
process.start()

total = 100 + 799 + 925
df = pd.read_csv('celeb.csv')
print("percentage found = {}%".format(len(df) / total))
        extractor = LinkExtractor(allow=("presse"), allow_domains='hessen.de')
        links = extractor.extract_links(response)

        extractor = LinkExtractor(
            deny_domains=('www.hessen.de', 'facebook.com', 'youtube.com',
                          'twitter.com', 'instagram.com',
                          'radroutenplaner.hessen.de'))
        linksext = extractor.extract_links(response)

        for link in linksext:
            yield {
                'from': response.url,
                'url': link.url,
                'text': link.text.strip()
            }

        for link in links:
            absolute_next_page_url = response.urljoin(link.url)
            yield scrapy.Request(absolute_next_page_url)


c = CrawlerProcess({
    'USER_AGENT': 'HochschuleDarmstadt-TextWebMining',
    'FEED_FORMAT': 'csv',
    'FEED_URI': '/media/sf_Shared/Git/data/HessenPresse.csv',
    'DOWNLOAD_DELAY': 1,
    'ROBOTSTXT_OBEY': True,
    'HTTPCACHE_ENABLED': True
})
c.crawl(HessenSpider)
c.start()  # the script will block here until the crawling is finished
def run_spider(spiders):
    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider)
    process.start()
        # Narrow in on the course blocks
        course_blocks = response.css('div.course-block')
        # Direct to the course links
        course_links = course_blocks.xpath('./a/@href')
        # Extract the links (as a list of strings)
        links_to_follow = course_links.extract()
        # Follow the links to the next parser
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)

    def parse_pages(self, response):
        """Code to parse course pages"""
        # Direct to the course title text
        crs_title = response.xpath(
            '//h1[contains(@class,"header-hero__title")]/text()')
        # Extract and clean the course title text
        crs_title_ext = crs_title.extract_first().strip()
        # Direct to the chapter titles text
        ch_titles = response.css('h4.chapter__title::text')
        # Extract and clean the chapter titles text
        ch_titles_ext = [t.strip() for t in ch_titles.extract()]
        # Store this in our dictionary
        dc_dict[crs_title_ext] = ch_titles_ext


dc_dict = dict()

process = CrawlerProcess()
process.crawl(DC_Chapter_Spider)
process.start()
                        c, flags=re.S)[0]
        s = json.loads(s)
        content = s['detail']['content']
        print(content)


# Fallback configuration so the spider can also crawl when run as a standalone
# script; when started from the project, the code below is not used.
if __name__ == '__main__':
    import os, time
    from scrapy.crawler import CrawlerProcess

    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # year-month-day_hour-minute-second
    filename = 'v{}.json'.format(timestamp)  # output file name (takes effect when 'FEED_URI' is uncommented)
    jobdir = 'JOBDIR/JKzECrMyDU'  # job queue directory (takes effect when 'JOBDIR' is uncommented)

    p = CrawlerProcess({
        'TELNETCONSOLE_ENABLED': False,  # hardly anyone uses this feature; disabling it speeds up spider start-up
        'MEDIA_ALLOW_REDIRECTS': True,  # allow redirects for image download URLs; use this whenever images are downloaded
        'LOG_LEVEL': 'INFO',  # DEBUG, INFO, WARNING, ERROR, CRITICAL
        # 'JOBDIR': jobdir,  # uncomment to enable resumable crawls
        #                    # (request queue, dedup fingerprints and crawl state - essentially a folder)
        # 'FEED_URI': filename,  # write the scraped data to a file
        # 'FEED_EXPORT_ENCODING': 'utf-8',  # roughly equivalent to ensure_ascii=False
        # 'FEED_FORMAT': 'json',  # output format; defaults to jsonlines when not set;
        #                         # supported formats: json, jsonlines, csv, xml, pickle, marshal
        # 'DOWNLOAD_TIMEOUT': 8,  # global request timeout, default 180; a per-request timeout can be set via meta ('download_timeout')
        # 'DOWNLOAD_DELAY': 1,  # global download delay; much more intuitive than the other throttling settings
    })
    p.crawl(VSpider)
    p.start()
        items["content"] = cons
        items["images"] = ""
        items["release_time"] = time.strftime("%Y-%m-%d")
        items["qa"] = ""
        items["source"] = urlparse(response.url).netloc
        items["author"] = ""
        items["url"] = response.url
        items["entity"] = ""
        items["label"] = []
        items["summary"] = []
        items["time_stamp"] = int(time.time())
        items["priority"] = 0
        items["nlp_state"] = 0
        items["static_page"] = 0
        s1 = {'hotword_id': hotword_id}
        s2 = {'$set': {'article_state': 1}}
        self.client.dailypops.hotword.update(s1, s2)
        yield items

    def md5_(self, str):
        md5 = hashlib.md5()
        data = str
        md5.update(data.encode('utf-8'))
        return md5.hexdigest()


if __name__ == '__main__':
    chinadaily = CrawlerProcess()
    chinadaily.crawl(Chinadaily)
    chinadaily.start()
            # 'addressStreet': item['addressStreet'],
            # 'addressState': item['addressState'],
            # 'addressCity': item['addressCity'],
            # 'addressZipcode': item['addressZipcode'],
            # # 'description': item['description'],
            # 'beds': item['beds'],
            # 'baths': item['baths'],
            # 'area': item['area'],
            # 'latitude': item['latLong']['latitude'],
            # 'longitude': item['latLong']['longitude'],
            # # 'brokerName': item['brokerName'],
            # # 'brokerPhone': item['brokerPhone'],
            # 'yearBuilt': item['hdpData']['homeInfo']['yearBuilt'],
            # 'lotSize': item['hdpData']['homeInfo']['lotSize'],
            # 'homeType': item['hdpData']['homeInfo']['homeType'],
            # 'homeStatus': item['hdpData']['homeInfo']['homeStatus'],
            # 'zestimate': item['hdpData']['homeInfo']['zestimate'],
            # # 'rentZestimate': item['hdpData']['homeInfo']['rentZestimate'],
            # 'festimate': item['hdpData']['homeInfo']['festimate'],
            # 'hiResImageLink': item['hdpData']['homeInfo']['hiResImageLink'],
        }


# main driver
if __name__ == '__main__':
    # run spider
    process = CrawlerProcess()
    process.crawl(ZillowSpider)
    process.start()

    # debug data extraction logic
    # ZillowSpider.parse(ZillowSpider, '')
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from leroymerlin import settings
from leroymerlin.spiders.leroy import LeroySpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process_object = CrawlerProcess(settings=crawler_settings)
    process_object.crawl(LeroySpider, category='молоток')
    process_object.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from gbparse import settings
# from gbparse.spiders.avito import AvitoSpider
# from gbparse.spiders.geekbrains import GeekbrainsSpider
# from gbparse.spiders.hhru import HhruSpider
from gbparse.spiders.vk import VkSpider

if __name__ == '__main__':
    scr_settings = Settings()
    scr_settings.setmodule(settings)
    process = CrawlerProcess(settings=scr_settings)
    # process.crawl(AvitoSpider)
    # process.crawl(GeekbrainsSpider)
    # process.crawl(HhruSpider)
    process.crawl(VkSpider)
    process.start()
        r = r"url\('(.*)'\)"
        name = "Name :" + html_soup.select(NAME_SELECTOR)[0].text
        author = "Author :" + html_soup.select(AUTHOR_SELECTOR)[0].text
        image_url = re.findall(
            r, html_soup.select(IMAGE_SELECTOR)[0].attrs['style'])[0]
        with open(basename(image_url), "wb") as f:
            f.write(get(image_url).content)
        window["-NAME-"].update(name)
        window["-AUTHOR-"].update(author)
        im = Image.open(basename(image_url))
        im.save('temp.png')
        window["-IMAGE-"].update(filename='temp.png')
    elif event == "DOWNLOAD":
        process = CrawlerProcess()
        process.crawl(GetLightNovelSpider, start_url=url, author=author,
                      name=name, html_soup=html_soup)
        process.start()
        process.stop()
    elif event == "UPDATE":
        spider = UpdateLightNovel(name[6:] + '.epub')
        spider.update()
# import the spiders you want to run
from spiders.toscrape import ToScrapeSpider
from spiders.toscrape2 import ToScrapeSpiderTwo

# scrapy api imports
# from scrapy import signals, log
from scrapy import signals
import logging
from twisted.internet import reactor
# from scrapy.crawler import Crawler
from scrapy.crawler import CrawlerProcess
# from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

process = CrawlerProcess()
process.crawl(ToScrapeSpider)
process.crawl(ToScrapeSpiderTwo)
process.start()  # the script will block here until all crawling jobs are finished
def _setup_process(self):
    self.now = datetime.now(self.settings.timezone).strftime(TS_FORMAT)
    self.process = CrawlerProcess(self._crawler_options())
    for site in self.sites:
        self.process.crawl(Spider, settings=site.settings, now=self.now)
#!/usr/bin/env python3

import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # get the list or use an empty list
        # (by default it would return `None`, but `start_urls` has to be a list)
        self.start_urls = kwargs.get('urls', [])

    def parse(self, response):
        print('url:', response.url)


# --- it runs without a project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider, urls=['http://quotes.toscrape.com'])
c.start()
class Scraper:

    def __init__(self, settings):
        self.settings = Settings(settings)
        self.sites = [Site(name, settings) for name in self.settings.names]
        self._setup_process()

    def start(self):
        self.process.start()
        self._json_to_csv()

    def _json_to_csv(self):
        for name in self.settings.names:
            data_exists = False
            errors_csv_name = self._file_with_name(name, ext='csv', appendix='_errors')
            csv_name = self._file_with_name(name, ext='csv')
            json_name = self._file_with_name(name)
            try:
                data = pandas.read_json(json_name)
                errors = pandas.read_csv(errors_csv_name)
                data_exists = True
            except ValueError:
                pass
            if data_exists:
                results = self._postprocess_dataframe(data, errors)
                results.to_csv(csv_name)

    def _postprocess_dataframe(self, data, errors):
        if 'url_item' in data.columns and 'url_search' in data.columns:
            searches = (data[data['url_item'].isnull()]
                        .set_index('search_string')
                        .dropna(axis='columns', how='all'))
            items = (data[data['url_search'].isnull()]
                     .set_index('search_string')
                     .dropna(axis='columns', how='all'))
            results = searches.join(items, how='outer', rsuffix='_delete')
            return results[[c for c in results.columns if '_delete' not in c]]
        return data

    def _setup_process(self):
        self.now = datetime.now(self.settings.timezone).strftime(TS_FORMAT)
        self.process = CrawlerProcess(self._crawler_options())
        for site in self.sites:
            self.process.crawl(Spider, settings=site.settings, now=self.now)

    def _crawler_options(self):
        """Return crawler options.

        `DOWNLOAD_DELAY` is in seconds; if `RANDOMIZE_DOWNLOAD_DELAY` is set
        to `True`, requests will be spaced between 0.5 * `DOWNLOAD_DELAY` and
        1.5 * `DOWNLOAD_DELAY`.

        `RETRY_TIMES` and `RETRY_HTTP_CODES` must be much more flexible if
        proxies are being used, because proxies can fail for a variety of
        reasons and we need to be able to adapt to that.
        """
        options = {
            'RANDOMIZE_DOWNLOAD_DELAY': True,
            'AUTOTHROTTLE_TARGET_CONCURRENCY': 0.1,
            'AUTOTHROTTLE_ENABLED': True,
            'CONCURRENT_REQUESTS': 2,
            'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
            'FEED_FORMAT': 'json',
            'FEED_URI': self._file_name(),
            'COOKIES_ENABLED': False,
            'LOG_LEVEL': 'DEBUG',
            'RETRY_TIMES': 2,
            'DOWNLOAD_DELAY': 5,
            'DOWNLOAD_TIMEOUT': 120,
            'DOWNLOADER_MIDDLEWARES': {
                # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
                # 'scraper.middlewares.CustomRetriesMiddleware': 550,
                'scraper.middlewares.SeleniumMiddleware': 950
            },
            'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'
        }
        m = 'DOWNLOADER_MIDDLEWARES'
        if self.settings.random_proxies:
            options[m]['scraper.middlewares.ProxiesMiddleware'] = 410
        if self.settings.random_user_agents:
            options[m]['scraper.middlewares.RandomUserAgentsMiddleware'] = 400
        if self.settings.mongo:
            options['ITEM_PIPELINES'] = {
                'scraper.pipelines.MongoWriterPipeline': 700
            }
        return options

    def _file_with_name(self, name, ext='json', appendix=''):
        return self._file_name(ext, appendix).replace("%(name)s", name)

    def _file_name(self, ext='json', appendix=''):
        return "outputs/%(name)s_{}{}.{}".format(self.now, appendix, ext)


            "search": ""
        }

        yield response.follow(
            url=self.base_url,
            method='POST',
            dont_filter=True,
            headers=self.headers,
            body=json.dumps(para),
            meta={
                # "dis_id": dis_id,
                'filename': filename
                # "seo_url": seo_url
            },
            callback=self.parse_page)

    def parse_page(self, response):
        data = json.loads(response.body)
        filename = response.meta["filename"]
        with open(filename, "a") as f:
            for item in data["articles"]:
                f.write(json.dumps(item) + '\n')


if __name__ == '__main__':
    # run scraper
    process = CrawlerProcess()
    process.crawl(chothueSpider)
    process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from workparser import settings
from workparser.spiders.hhru import HhruSpider
from workparser.spiders.sjru import SjruSpider

if __name__ == '__main__':
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(HhruSpider)
    # process.crawl(SjruSpider)
    process.start()
            for tag in body.select('style'):
                tag.decompose()
            text = body.get_text(separator='\n')
            text = text.replace("\n", " ").replace("\t", " ").replace("\r", " ")
            return text.lower()

        web_text = get_text_bs(web_text)
        exsit_list = checkActivity(act_list, web_text)
        activities = ', '.join(exsit_list)
        start_url = ', '.join(self.start_urls)

        item = {}
        item['start_url'] = start_url
        item['activities'] = activities
        return item


process = CrawlerProcess(settings={
    "FEEDS": {
        "data/items_9.json": {
            "format": "json"
        },
    },
})
process.crawl(ActivitySpider9)
process.start()
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from blogparse import settings
# from blogparse.spiders.habr_blog import HabrBlogSpider
from blogparse.spiders.avito import AvitoSpider

if __name__ == '__main__':
    craw_settings = Settings()
    craw_settings.setmodule(settings)
    crawler_proc = CrawlerProcess(settings=craw_settings)
    # crawler_proc.crawl(HabrBlogSpider)
    crawler_proc.crawl(AvitoSpider)
    crawler_proc.start()
    #start_urls = []

    tags = ['love', 'inspirational', 'life', 'humor', 'books', 'reading']
    pages = 3
    url_template = 'http://quotes.toscrape.com/tag/{}/page/{}'

    def start_requests(self):
        for tag in self.tags:
            for page in range(self.pages):
                url = self.url_template.format(tag, page)
                yield scrapy.Request(url)

    def parse(self, response):
        print('url:', response.url)


# --- run it without project ---

from scrapy.crawler import CrawlerProcess

#c = CrawlerProcess({
#    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
#    'FEED_FORMAT': 'csv',
#    'FEED_URI': 'data.json',
#})

c = CrawlerProcess()
c.crawl(MySpider)
c.start()
from shixiseng.spiders.shixisengspider import ShixisengspiderSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the settings from the settings.py module
settings = get_project_settings()
process = CrawlerProcess(settings=settings)
# Multiple spiders can be added here
process.crawl(ShixisengspiderSpider)
# Start the crawler; this blocks until the crawl finishes
process.start()
import argparse

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

parser = argparse.ArgumentParser()

# Script arguments
parser.add_argument('-f', '--futbol', action='store_true')
parser.add_argument('-bm', '--balonman', action='store_true')
parser.add_argument('-cF', '--codigoFederacion', default='none')
parser.add_argument('-cC', '--codigoCompeticion', required=True)
parser.add_argument('-cG', '--codigoGrupo', default=None)
parser.add_argument('-cT', '--codigoTemporada', default='15')
parser.add_argument('-cX', '--codigoXornada', default=0)

args = parser.parse_args()

process = CrawlerProcess({'SPIDER_MODULES': 'tfgObtencionDatos.spiders'})

if args.futbol:
    # Football data is needed
    federacionsPNFG = ['gal', 'mad', 'ceu', 'rioj', 'clm', 'and', 'cant']
    federacionsPNFGBasic = ['arg', 'mur', 'ext']
    if args.codigoFederacion in federacionsPNFG:
        # PNFG scraper
        process.crawl('pnfg',
                      federacion=args.codigoFederacion,
                      grupo=args.codigoGrupo,
                      competicion=args.codigoCompeticion,
                      temporada=args.codigoTemporada,
                      xornada=args.codigoXornada)
        process.start()
    elif args.codigoFederacion in federacionsPNFGBasic:
        # Basic PNFG scraper
        process.crawl('pnfgBasic',
                      federacion=args.codigoFederacion,
                      grupo=args.codigoGrupo,
    def parse(self, response):
        data = json.loads(response.body)

        for item in data.get('data', []):
            yield {
                'car_id': item.get('id'),
                'car_name': item.get('title'),
                'price': item.get('price.value.currency.display'),
                'user_id': item.get('user_id'),
                # 'user_name':
            }

        metadata = data.get('metadata')
        if metadata:
            url = metadata.get('next_page_url')
            if url:
                yield scrapy.Request(url)


# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',  # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id,
        config.api_key,
        config.index_name,
        AlgoliaSettings.get(config, strategy.levels),
        config.query_rules,
        environ.get('REPLACE_DOMAIN', None))

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'
    DUPEFILTER_CLASS_PATH = 'scraper.src.custom_dupefilter.CustomDupeFilter'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_downloader_middleware.CustomDownloaderMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'
        DUPEFILTER_CLASS_PATH = 'src.custom_dupefilter.CustomDupeFilter'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        # Needs to be > 600 to run after the RedirectMiddleware
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding links provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        algolia_helper=algolia_helper,
        strategy=strategy)

    process.start()
    process.stop()

    # Kill the browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        # config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
def crawl_run():
    scope = 'all'
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(QuotesSpider, scope)
    process.start()
    process.join()
                fRelationName = tds[1].get_text()
                fHouseName = tds[2].get_text()
                fSerialNo = tds[3].get_text().strip()
                fLACNo = tds[4].get_text().strip()
                fPSNo = tds[5].get_text().strip()
                fIdCardNo = tds[6].select('td > a')[0].get_text().strip()
                fStatus = tds[7].get_text().strip()
                fPrimaryIdCardNo = idCardNo
                self.familyWriter.writerow([
                    fNameOfElector, fRelationName, fHouseName, fSerialNo,
                    fLACNo, fPSNo, fIdCardNo, fStatus, fPrimaryIdCardNo,
                    addStr
                ])
        except Exception as e:
            print("[Family Write Error]", e)

        filename = response.url.split('=')
        filename = filename[len(filename) - 1] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

        soup = BeautifulSoup(open(filename, encoding="utf-8"), 'lxml')
        if 'Invalid access to the page' in soup.text:
            os.remove(filename)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()
# coding:utf-8
import scrapy
from scrapy.crawler import CrawlerProcess


class MySpider1(scrapy.Spider):
    # Your first spider definition
    pass


class MySpider2(scrapy.Spider):
    # Your second spider definition
    pass


process = CrawlerProcess()
process.crawl(MySpider1)
process.crawl(MySpider2)
process.start()
        results_dict['full_text'] = news_full_text_ext
        results_dict['link'] = response.url
        results_dict['tags'] = news_tags_ext

        results_list.append(results_dict)


if __name__ == '__main__':
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    filename = 'moneytimes'

    # List to save the data collected
    results_list = list()

    # Initiate a CrawlerProcess
    process = CrawlerProcess()

    # Tell the process which spider to use
    process.crawl(MoneyTimesSpider)

    # Start the crawling process
    process.start()

    # Save the list of dicts
    with open(os.path.join(THIS_DIR + '/data/results-{}.json'.format(filename)), 'w', encoding='utf8') as f:
        json.dump(results_list, f, ensure_ascii=False)
        for data in datas:
            yield {
                'scrape_date': scrape_date,
                'types': types,
                'user_pic': user_pic,
                'date_update': date_update,
                'provinsi': provinsi,
                'kabkot': kabkot,
                'kecamatan': data["properties"]["name"],
                'kelurahan': '',
                'alamat': '',
                'total_odp': data["properties"]["odp_total"],
                'total_pdp': data["properties"]["pdp_total"],
                'total_positif': data["properties"]["positif_total"],
                'positif_sembuh': data["properties"]["positif_sembuh"],
                'positif_dirawat': data["properties"]["positif_dirawat"],
                'positif_isolasi': '',
                'positif_meninggal': data["properties"]["positif_meninggal"],
                'total_otg': '',
                'odr_total': '',
                'total_pp': '',
                'total_ppdt': '',
                'source_link': source_link,
            }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(PurbalinggaSpider)
    process.start()