def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')]
def __init__(self):
    # start logging
    # log.log.defaultObserver = MyObserver()
    # log.log.defaultObserver.start()
    # log.started = False
    log.start()
def parse_item(self, response):
    # Extract the data into Items, mainly using XPath and CSS selectors
    # to pull fields out of the page.
    log.start(logfile='log.txt', loglevel=log.WARNING)
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    catalog = sel.css('div.box_1 div.sp_13').xpath('text()').extract()[0]
    sites = sel.css('div.centerPadd div.sp_16')
    for site in sites:
        item = GuoShuItem()
        item['siteid'] = self.siteid
        item['sitename'] = self.sitename
        item['name'] = site.css('p a').xpath('text()').extract()[0]
        relative_url = site.css('p a').xpath('@href').extract()[0]
        item['detailurl'] = urlparse.urljoin(base_url, relative_url)  # urljoin_rfc(base_url, relative_url)
        item['catalog'] = catalog
        item['guige'] = site.css('.shop').xpath('text()').extract()[0]
        price = site.css('.shop_s2').xpath('text()').extract()
        item['price'] = price[0].split('/')[0].replace("¥", "")
        item['danwei'] = price[0].split('/')[1]
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
        # log.msg('item %s' % repr(item).decode("unicode-escape"), level=log.WARNING)
    # info('parsed ' + str(response))
    return items
def __init__(self, cityid=None, info_log=None): if info_log == None: raise NotConfigured("HotelScrapy类中: 参数info_log不能为空") super(HotelScrapy, self) self.info_log = info_log log.start(logfile=self.info_log, loglevel=log.INFO, logstdout=False) if cityid == None or cityid == "all": citys = CityItem.django_model.objects.all() else: citys = CityItem.django_model.objects.filter(id=cityid) if len(citys) == 0: raise NotConfigured("参数cityid:" + cityid + "不存在于表city_city中,请检查") for city in citys: self.city_entrance_urls.append(city.href) if len(self.city_entrance_urls) > 0: scrapy_item = ScrapyItem() scrapy_item.scrapy_name = self.name if scrapy_item.is_existed_scrapy_name() is False: scrapy_item.save() self.scrapy_batch_item.scrapy_name = self.name self.scrapy_batch_item.batch_number = self.batch_number self.scrapy_batch_item.status = "scrapy_running" self.scrapy_batch_item.save()
def __init__(self):
    log.start(logfile=time.strftime("log/%Y%m%d%H%M%S") + ".log", logstdout=False)
    log.msg("initiating crawler...", level=log.INFO)
    self.crawler_id = self.get_crawler_id()
    log.msg("crawler id is %s" % self.crawler_id, level=log.INFO)
    self.r.set('crawler:ip:%s' % self.crawler_id, utils.get_external_ip())
    self.r.set('crawler:port:%s' % self.crawler_id, settings.REDIS_LOCAL_PORT)
    self.r.set('crawler:mapping_port:%s' % self.crawler_id, settings.REDIS_LOCAL_MAPPING_PORT)
    log.msg("crawler ip is %s, port is %d" % (utils.get_external_ip(), settings.REDIS_LOCAL_PORT), level=log.INFO)
    account = self.get_account()
    self.username = account[0]
    self.password = account[1]
    log.msg("crawler account got", level=log.INFO)
    self.r_local.set('crawler:status:%s' % self.crawler_id, 'good')
    self.r_local.set('crawler:update_time:%s' % self.crawler_id, datetime.datetime.utcnow().strftime("%s"))
    log.msg("local crawler status set", level=log.INFO)
    heartbeat_thread = threading.Thread(target=self.maintain_local_heartbeat)
    heartbeat_thread.start()
    log.msg("local crawler heartbeat started", level=log.INFO)
    if platform.system() == "Linux":
        # on linux, use virtual display
        vdisplay = Xvfb()
        vdisplay.start()
    co = ChromeOptions()
    # TODO: Disable image after log in
    # TODO: optimize memory usage
    co.add_experimental_option("prefs", {"profile.default_content_settings": {"popups": 1}})
    # co.add_experimental_option("prefs", {"profile.default_content_settings": {"popups": 1, "images": 2, "media": 2}})
    self.driver = webdriver.Chrome(chrome_options=co)
    self.driver.set_window_size(640, 960)
def call_spider(file):
    """
    Creates the spider and runs the reactor.
    Copies the crawl results into .json files, which are later
    transformed into the corresponding data.json files.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
    domains = []
    urls = []
    created_files = []
    for u in list_url:
        domain = u.strip('\n')
        url_aux = domain.split("/")
        domain_type = False
        if len(url_aux) > 1:
            domain = url_aux[0]
            url = "http://" + url_aux[0] + "/datos/data"
            if domain == 'www.paraguay.gov.py':
                url = "http://" + url_aux[0] + "/datos"
        else:
            url = "http://" + u.strip('\n') + "/data"
            domain_type = True
        print "============= Domain " + domain
        print "============= Start url " + url
        response = requests.get(url + "/data.json")
        if response.status_code == 200:
            filename = FileController.FileController().save_existing_data_json(response, domain, True)
            created_files.append({'modalidad': 'recolecta', 'archivo': filename})
        else:
            domains.append(domain)
            urls.append(url)
    spider = DataSpider(domains=domains, start_urls=urls, domain_type=domain_type)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
    reactor.run()  # the script will block here
    """
    Copy the data to the .json files
    """
    data_spider.copy_items_to_files()
    """
    Remove temporary files
    """
    FileController.FileController().clean_tmp_files()
    """
    Convert the .json files to data.json (POD format)
    """
    for domain in domains:
        filename = DataJson.DataJson().convert(domain)
        created_files.append({'modalidad': 'data-hunting', 'archivo': filename})
    return created_files
def main():
    spider = DrugSynonymsSpider()
    log.start()
    setup_crawler(spider)
    reactor.run()

    items = []
    with codecs.open('drug_synonyms.txt', encoding='utf8', mode='r') as file:
        synonyms = []
        for line in file:
            if line.startswith('synonyms'):
                line = line[10:]
                synonyms = line.split('|')
            elif line.startswith('name'):
                name = line[6:]
                temp_list = [name]
                temp_list.extend(synonyms)
                items.append(temp_list)

    d = {}
    for line in items:
        for word in line:
            raw = list(line)
            raw.remove(word)
            d[word] = raw

    with codecs.open('drug_synonyms_dictionary.txt', encoding='utf8', mode='w') as file:
        for pair in d.items():
            s = pair[0].strip() + '|'
            for word in pair[1]:
                s += word.strip() + ','
            s += '\n'
            file.write(s)
def __init__(self):
    log.start(settings.LOG_FILE)
    try:
        engine = db_connect()
        self.Session = sessionmaker(bind=engine)
    except Exception as e:
        pass
def __init__(self, category=None, *args, **kwargs):
    self.driver = webdriver.Firefox()
    super(SeleniumCrawlerSpider, self).__init__(*args, **kwargs)
    LOG_FILE = "scrapy_%s_%s.log" % (self.name, "now")
    # remove the current log
    # log.log.removeObserver(log.log.theLogPublisher.observers[0])
    # re-create the default Twisted observer which Scrapy checks
    log.log.defaultObserver = log.log.DefaultObserver()
    # start the default observer so it can be stopped
    log.log.defaultObserver.start()
    # trick Scrapy into thinking logging has not started
    log.started = False
    # start the new log file observer
    log.start(LOG_FILE)
    # continue with the normal spider init
    # defining the trip "leg" code (Dublin - Liverpool [18] / Liverpool - Dublin [66])
    dcode = "18"
    if category == "dublin":
        dcode = "18"
    elif category == "liverpool":
        dcode = "66"
    self.start_urls = ['https://ssl.directferries.com/ferry/secure/multi_price_detail.aspx?stdc=DF10&grid=0&rfid=%s&psgr=1&curr=1&retn=True' % dcode]
    self.log("Init finished")
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
def __init__(self, cityid=None, info_log=None): if info_log == None: raise NotConfigured("ReviewScrapy类中: 参数info_log不能为空d") super(ReviewScrapy, self) self.info_log = info_log if cityid == None or cityid == "all": self.cityid = cityid else: city = City.get_city_by_id(cityid) if city == None: raise NotConfigured("参数cityid:" + cityid + "不存在于表city_city中,请检查") else: self.cityid = cityid self.city_name = city.name_ch log.start(logfile=info_log, loglevel=log.INFO, logstdout=False) # store scrapy scrapy_item = ScrapyItem() scrapy_item.scrapy_name = self.name if scrapy_item.is_existed_scrapy_name() is False: scrapy_item.save() # record scrapy status self.scrapy_batch_item.scrapy_name = self.name self.scrapy_batch_item.batch_number = self.batch_number self.scrapy_batch_item.status = "scrapy_running" self.scrapy_batch_item.save()
def __init__(self, region=None, letterIn=None, *args, **kwargs):
    super(BwPrivateSpider, self).__init__(*args, **kwargs)

    ### region, letterIn, start_urls
    self.region = region
    self.letterIn = letterIn
    self.start_urls = [self.symbollookup_url + "&region=" + self.region +
                       "&letterIn=" + self.letterIn +
                       "&firstrow=" + self.start_firstrow]

    ### setting log file: LOG_ROOT/<spider name>/<region>/<letterIn>/<spider name>-<region>-<letterIn>.log
    log_path = os.path.join(settings.LOG_ROOT, self.name, self.region, self.letterIn)
    if not os.path.isdir(log_path):
        os.makedirs(log_path)
    log_file = os.path.join(log_path, '-'.join([self.name, self.region, self.letterIn]) + '.log')
    if os.path.isfile(log_file):
        os.remove(log_file)
    print "log file: ", log_file
    log.start(logfile=log_file, loglevel=log.INFO, logstdout=False)

    ### setting json data output: DATA_ROOT/<spider name>/<region>/<letterIn>/<spider name>-<region>-<letterIn>.json
    data_path = os.path.join(settings.DATA_ROOT, self.name, self.region, self.letterIn)
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    data_file = os.path.join(data_path, '-'.join([self.name, self.region, self.letterIn]) + '.json')
    if os.path.isfile(data_file):
        os.remove(data_file)
    print "data file: ", data_file
    self.data_file = data_file
def parse_item(self, response):
    # Extract the data into Items, mainly using XPath and CSS selectors
    # to pull fields out of the page.
    log.start(logfile='log.txt', loglevel=log.WARNING)
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    catalog = sel.css('div.cc').xpath('text()').extract()[2]
    catalog = catalog[catalog.index(u'品牌:'):].replace("\r\n", "").replace("品牌:", "").lstrip().rstrip()
    item = GuoShuItem()
    item['siteid'] = self.siteid
    item['sitename'] = self.sitename
    item['name'] = sel.css('div.cc h2').xpath('text()').extract()[0]
    item['detailurl'] = base_url
    item['catalog'] = catalog
    item['guige'] = sel.css('div.cc b').xpath('text()').extract()[0]
    price = sel.css('div.cc').xpath('.//font[@color="red"]/text()').extract()[0]
    item['price'] = price
    item['danwei'] = item['guige']
    items.append(item)
    # print repr(item).decode("unicode-escape") + '\n'
    # log.msg('item %s' % repr(item).decode("unicode-escape"), level=log.WARNING)
    # info('parsed ' + str(response))
    return items
def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u""
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name="test", domain="testdomain")
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel="ERROR")
    reactor.run()
    # I suspect web actions may be broken...
    assert webdriver.get.mock_calls == [
        call("http://testdomain/path?wr=0"),
        call("http://testdomain/path?wr=0&wa=0"),
        call("http://testdomain/path?wr=0&wa=1"),
        call("http://testdomain/path?wr=1"),
        call("http://testdomain/path?wr=1&wa=0"),
        call("http://testdomain/path?wr=1&wa=1"),
        # call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call("http://testdomain/path?wr=0&wa=1&wr=0"),
        call("http://testdomain/path?wr=0&wa=1&wr=0"),
        # call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call("http://testdomain/path?wr=1&wa=1&wr=0"),
        call("http://testdomain/path?wr=1&wa=1&wr=0"),
    ]
def __init__(self, category=None, *args, **kwargs):
    self.driver = webdriver.Firefox()
    super(SeleniumCrawlerSpider, self).__init__(*args, **kwargs)
    LOG_FILE = "scrapy_%s_%s.log" % (self.name, "now")
    # remove the current log
    # log.log.removeObserver(log.log.theLogPublisher.observers[0])
    # re-create the default Twisted observer which Scrapy checks
    log.log.defaultObserver = log.log.DefaultObserver()
    # start the default observer so it can be stopped
    log.log.defaultObserver.start()
    # trick Scrapy into thinking logging has not started
    log.started = False
    # start the new log file observer
    log.start(LOG_FILE)
    # continue with the normal spider init
    # defining the trip "leg" code (Dublin - Liverpool [18] / Liverpool - Dublin [66])
    dcode = "18"
    if category == "dublin":
        dcode = "18"
    elif category == "liverpool":
        dcode = "66"
    self.start_urls = [
        'https://ssl.directferries.com/ferry/secure/multi_price_detail.aspx?stdc=DF10&grid=0&rfid=%s&psgr=1&curr=1&retn=True' % dcode
    ]
    self.log("Init finished")
def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    # I suspect web actions may be broken...
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        # call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        # call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')
    ]
def runspider():
    date = datetime.datetime.utcnow()
    unix_date = calendar.timegm(date.utctimetuple())
    route = request.args.get('route')
    domain = request.args.get('domain')
    directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
    if not os.path.exists(directory):
        os.makedirs(directory)
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
    log.start(loglevel=logging.DEBUG)
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = MySpider(route, unix_date)
    settings_module = importlib.import_module('SiteCrawler.settings')
    settings = CrawlerSettings(settings_module)
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
    return redirect(url_for('choose_graph', domain=domain, date=unix_date))
def setup_crawler(spider_class, **kwargs):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    """
    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
def spider_setup():
    spider = Lily_bbs()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def crawl(self, queue_object):
    ch, method, properties, body = yield queue_object.get()
    if body:
        # body is a pickled byte string from the queue, so use loads() rather than load()
        spider = pickle.loads(body)
        t = CrawlerProcess(spider)
        log.start()
        t.setup()
    yield ch.basic_ack(delivery_tag=method.delivery_tag)
def crawl():
    spider = StackserviceSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here
def __init__(self):
    log.start('logfile')
    self.conn = sqlite3.connect('russia.db')
    self.c = self.conn.cursor()
    query = '''
        CREATE TABLE IF NOT EXISTS kremlin(
            id INTEGER PRIMARY KEY,
            title TEXT,
            body TEXT,
            keywords TEXT,
            post_date DATE,
            link TEXT)
    '''
    self.c.execute(query)
def __init__(self, mailer=None):
    super(HemaSpider, self).__init__()
    log.start('d:/3.log', log.WARNING, logstdout=True)
    self.userid_pa = re.compile('uid-(\d+)')
    self.reply_pattern = re.compile('(\d+).*?(\d+)')
    self.post_time_pa = re.compile('</a>.*?(\d+-\d+-\d+).*?(\d+:\d+)', re.S)
    self.mail = mailer
    self.site_id = 33
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
def crawl(spider_class):
    log.start()
    spider = spider_class()
    crawler = Crawler(CrawlerSettings(scrapy_settings_module))
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
def _crawl(self, queue, search):
    log.start(loglevel=log.DEBUG)
    current_spider = CraigslistSpider()
    if search:
        current_spider.set_search_url(search)
    self.crawler.crawl(current_spider)
    self.crawler.start()
    self.crawler.stop()
    queue.put(current_spider.get_object_list())
def __init__(self):
    CrawlSpider.__init__(self)
    log.start(logfile="./log/szlib-%s.log" % strftime("%m%d-%H-%M", localtime(time())),
              loglevel=log.INFO, logstdout=False)
    log.msg("szlibspider start")
    print "szlibspider start"
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox",
                             "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver = Firefox()
    self.selenium.start(driver=ffdriver)
    # self.selenium.start()
    sel = self.selenium
    # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting page to load")  # Wait for javascript to load in Selenium
    # time.sleep(20)
    # sel.wait_for_condition("condition by js", 20000);
    # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")
    elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit district_list']/a")
    num = "wijefowaeofjwejf SSL0011"
    selflibs_num = []
    for district in elements[1:]:
        log.msg("%s selflibs:" % district.text)
        log.msg("==================")
        district.click()
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load selflibs")
        selflibs_elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
        for selflib_ele in selflibs_elements:
            # num = selflib_ele.find_element_by_class_name("num").text
            num = selflib_ele.find_element_by_class_name("num").get_attribute("textContent")
            log.msg("num %s" % num)
            selflibs_num.append(num[-7:])
            log.msg("numid %s" % num[-7:])
            log.msg("%s" % selflib_ele.find_element_by_class_name("title").get_attribute("textContent"))
            log.msg("%s" % selflib_ele.find_element_by_class_name("text").get_attribute("textContent"))
            log.msg("---------------")
        log.msg("------1---------")
    # ffdriver.quit()
    # numstr = unicode("编号","utf-8")
    # numstr = unicode(num,"utf-8")
    # log.msg("numstr is in num? %s" % (numstr in num))
    # log.msg("%s,%s, %s" % (num,num[1], num[-7:]))
    for selflibnum in selflibs_num:
        selflib_url = "http://www.szlib.gov.cn/libraryNetwork/dispSelfLibBook/id-5/%s.html" % selflibnum
        log.msg("selflib url %s" % selflib_url)
        ffdriver.get(selflib_url)
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load booklist")
        categorys_elements = ffdriver.find_elements_by_xpath("//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit category']/a")
        for category_ele in categorys_elements[1:]:
            log.msg("%s" % category_ele.text)
def runSpider(self):
    dispatcher.connect(SpiderRunner.stop_reactor, signal=signals.spider_closed)
    crawler = self.__crawler
    crawler.crawl(self.__spider)
    crawler.start()
    log.start()
    log.msg("Starting spider...")
    reactor.run()
    log.msg("Stopped spider.")
def sequentialCrawling():
    config = Configuration()
    spiders = getSpiderObjects(config)
    settings = get_project_settings()
    for spider in spiders:
        args = [spider, settings]
        runSpider(args)
    log.start()
    reactor.run()  # the script will block here until the spider_closed signal was sent
def __init__(self):
    log.start()
    self.conn = MySQLdb.connect(user=settings.MYSQL_DATABASE['user'],
                                passwd=settings.MYSQL_DATABASE['passwd'],
                                db=settings.MYSQL_DATABASE['db'],
                                host=settings.MYSQL_DATABASE['host'],
                                charset=settings.MYSQL_DATABASE['charset'],
                                use_unicode=settings.MYSQL_DATABASE['use_unicode'])
    self.cursor = self.conn.cursor()
def run(self):
    crawler = Crawler(get_project_settings())
    crawler.configure()
    log.start()
    for spiderName in crawler.spiders.list():
        self.spiderCounter += 1
        self.setupCrawler(spiderName)
    reactor.run()
def get_more_entropy():
    spider = TruenetSpider(domain='truenet.co.nz')
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def startspider(name):
    spider = tbs1(name)
    crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
    print crawler
def handle(self, *args, **options):
    spider = HHSearchResultsSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here until the spider_closed signal was sent
def run_login_spider(seed_url, username, password, db_name, logfile="results.log"):
    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url=seed_url, username=username, password=password)
    d.addBoth(lambda _: reactor.stop())
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level=log.INFO)
    reactor.run()
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path)
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping
    return job.raw_response
def setup_crawler(keyword):
    print 'schedule run script is running.........'
    spider = BaiduSpider(keyword=keyword)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=log.DEBUG)
    reactor.run()
def setup_crawler(keywords):
    spider = BaiduSpider(keywords=keywords)
    settings = get_project_settings()
    crawler = Crawler(settings)
    # stop reactor when spider closes
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=log.DEBUG)
    reactor.run()
def run_main():
    log.start()
    InitLog()
    settings = get_project_settings()
    crawler = Crawler(settings)
    spider = JobKeySpider.from_crawler(crawler)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start(close_if_idle=True)
    reactor.run()  # the script will block here until the spider_closed signal was sent
def crawla(self):
    # dispatcher.connect(reactor.stop(), signal=signals.spider_closed)
    spider = Titlespider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def run():
    spider = thSpider(domain='cn-proxy.com')
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here until the spider_closed signal was sent
def _cron_kaohsiung():
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = KaohsiungSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
def __init__(self, info_log=None):
    if info_log is None:
        raise NotConfigured("CityScrapy: the info_log argument must not be empty")
    super(CityScrapy, self).__init__()
    log.start(logfile=info_log, loglevel=log.INFO, logstdout=False)
    scrapy_item = ScrapyItem()
    scrapy_item.scrapy_name = self.name
    if scrapy_item.is_existed_scrapy_name() is False:
        scrapy_item.save()
def setup_crawler():
    # spider = FollowAllSpider(domain=domain)
    spider = zackSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def get_connected_devices(ip_address, password, loglevel="WARNING"):
    spider = ConnectedDeviceSpider(ip_address, password)
    collector = ItemCollector()
    crawler = Crawler(Settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.signals.connect(collector.add_item, signals.item_passed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=loglevel, logstdout=False)
    reactor.run()  # the script will block here
    return collector.items
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    # schedule spider
    crawler.crawl(spider)
    log.start()
    # start engine scrapy/twisted
    crawler.start()
def run_spider(origin='', destination='', departure_date='', return_date=''):
    spider = SkyScannerOriginDestinationSpider(
        origin=origin,
        destination=destination,
        departure_date=compact_date(departure_date),
        return_date=compact_date(return_date),
    )
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def main():
    import sys
    sys.path.append("/home/scriptrunner/")
    spider = SexoffSpider(county='ORANGE')
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'scraper.StoreItemsPipeline': 1000})
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def parse_item(self, response):
    """ This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.jianshu.com/p/b851e04de659
    @returns items 1 16
    @scrapes author content title url datetime wordnum views_count comments_count likes_count followers_count total_likes_count rank
    """
    item = JianshuItem()
    log.start(logfile='log.txt', loglevel=log.INFO)
    log.msg('RequestURL:%s' % response.url, spider=JSSpider)
    contents = response.xpath('//div[contains(@class, "preview")]')[0]
    item['title'] = contents.xpath('h1[contains(@class,"title")]/text()').extract()[0]
    item['author'] = contents.xpath('div/a[contains(@class,"author-name")]/span/text()').extract()[0]
    item['datetime'] = contents.xpath('div[contains(@class,"author-info")]/span/text()').extract()[1]
    pagecons = response.xpath('//div[contains(@class, "show-content")]/p')
    item['content'] = pagecons.extract()
    item['url'] = response.url
    scriptlists = response.xpath('//script[contains(@data-name,"note")]/text()').extract()
    scriptlist6 = scriptlists[0].strip().split(',')[-6:]
    newscripts = []
    for script in scriptlist6:
        newscripts += script.encode('utf8').split(':')
    newscript = [n.replace('"', '') for n in newscripts]
    newdict = dict(newscript[i:i + 2] for i in range(0, len(newscript), 2))
    item['wordnum'] = newdict.get('wordage')
    item['views_count'] = newdict.get('views_count')
    item['likes_count'] = newdict.get('likes_count')
    item['comments_count'] = newdict.get('comments_count')
    followersandtotallikes = response.xpath('//script[contains(@data-name,"author")]/text()').extract()
    followersandtotallikes2 = followersandtotallikes[0].strip().split(',')[-3:-1]
    newfollowersandtotallikes2 = []
    for followersandlikes in followersandtotallikes2:
        newfollowersandtotallikes2 += followersandlikes.encode('utf8').split(':')
    followerslikes = [n.replace('"', '') for n in newfollowersandtotallikes2]
    followerslikesdict = dict(followerslikes[i:i + 2] for i in range(0, len(followerslikes), 2))
    item['followers_count'] = followerslikesdict.get('followers_count')
    item['total_likes_count'] = followerslikesdict.get('total_likes_count')
    return item
def sitecrawl(request):
    if 'q' in request.GET:
        q = request.GET['q']
        spider = testspider(domain=q)
        crawler = Crawler(Settings())
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start()
        reactor.run()
        return render(request, 'sitescrawl.html')
    else:
        return render(request, 'sitescrawl.html')
def crawl_resident_advisor():
    global spider_count
    spider_count = 0
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(linkedin())
    crawler.start()
    log.start()
    log.msg('Running in reactor...')
    reactor.run()  # the script will block here
    log.msg('Reactor stopped.')
def run(self):
    dispatcher.connect(self.stop_reactor, signal=signals.spider_closed)
    spider = PriceSpider(self.str)
    testset = Settings()
    testset.set("ITEM_PIPELINES", {'tutorial.pipelines.MySQLStorePipeline': 1})
    crawler = Crawler(testset)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    log.msg('Running reactor...')
    reactor.run(installSignalHandlers=0)  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')