def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Create a profile with specific add-ons and have Firefox load it.
    profile = FirefoxProfile(profile_directory="/Library/Python/2.6/site-packages/selenium/webdriver/firefox")
    self.selenium = webdriver.Firefox(profile)
def __init__(self, city, plusDate, nightCount):
    self.city = city
    self.plusDate = int(plusDate)
    self.nightCount = nightCount
    CrawlSpider.__init__(self)
    self.startWebDriver()
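Several of the travel spiders in this collection call a startWebDriver() helper that is not included in these snippets. A minimal sketch of what such a helper might look like, assuming Firefox and a self.driver attribute; both are assumptions, not confirmed by the original code:

# Hypothetical helper, not part of the original snippets: attaches a Selenium
# browser to the spider instance so parse methods can drive it.
from selenium import webdriver

def startWebDriver(self):
    self.driver = webdriver.Firefox()   # assumed browser choice
    self.driver.implicitly_wait(10)     # assumed default wait, in seconds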
def __init__(self, city, num, plusDate):
    self.city = city
    self.num = num
    self.plusDate = int(plusDate)
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self):
    CrawlSpider.__init__(self)
    self.verification_errors = []
    self.selenium = selenium("localhost", 4444, "*firefox", "http://yue.fm/")
    self.selenium.start(driver=webdriver.Chrome())
    self.count = 0
def __init__(self):
    CrawlSpider.__init__(self)
    print "szlibspider start"
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox",
                             "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver = Firefox()
    self.selenium.start(driver=ffdriver)
    # self.selenium.start()
    sel = self.selenium
    # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    # Wait for the page's JavaScript to finish loading.
    WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting for page to load")
    # time.sleep(20)
    # sel.wait_for_condition("condition by js", 20000)
    # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")
    elements = ffdriver.find_elements_by_xpath("//ul[@class='servicepointlist']/li[@class='item']")
    for element in elements[:5]:
        print "%s" % element.find_element_by_class_name("num").text
        print "%s" % element.find_element_by_class_name("title").text
        print "%s" % element.find_element_by_class_name("text").text
        print "---------------"
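The ajax_complete callable passed to WebDriverWait above (and in the longer variant later in this collection) is not defined in these snippets. A plausible sketch, assuming the target pages use jQuery; the jQuery.active check is an assumption rather than something the original confirms:

# Hypothetical wait condition, assuming the pages load content via jQuery.
# WebDriverWait calls this repeatedly with the driver until it returns True.
from selenium.common.exceptions import WebDriverException

def ajax_complete(driver):
    try:
        # jQuery.active counts in-flight AJAX requests; 0 means the page is idle.
        return 0 == driver.execute_script("return jQuery.active")
    except WebDriverException:
        # jQuery may not be available yet; keep waiting.
        return False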
def __init__(self, xpath_dict={}, files=None):
    CrawlSpider.__init__(self)
    self.xpath_dict = xpath_dict
    self.from_url_file = files
    self.savingPipe = SavingPipeline()
    if self.from_url_file:
        self.crawl_from_files()
def __init__(self, root, date, **kwargs):
    # super(MySpider, self).__init__(*args, **kwargs)
    CrawlSpider.__init__(self, **kwargs)
    domain = get_domain(root)
    self.scrape_domain = domain
    self.unix_date = date
    self.urls_list = []
    conn = sqlite3.connect(db_file)
    c = conn.cursor()
    insert_row(c, "INSERT INTO scrapes (id, domain, date) VALUES (?, ?, ?)",
               (None, domain, self.unix_date))
    self.scrapeid = c.lastrowid
    log.msg("scrapeid = {0}".format(self.scrapeid))
    conn.commit()
    conn.close()
    self.long_count = 0
    self.seed = URL(
        root,
        self.scrapeid,
        self.long_count,
        base={"protocol": "http://", "subdomain": "", "domain": domain, "path": ""},
    )
    self.start_urls = [self.seed.full]
    self.allowed_domains = [domain, "facebook.com", "twitter.com"]
    self.long_count = self.seed.long_count
def __init__(self, fromCity, toCity, plusDate):
    self.fromCity = fromCity.upper()
    self.toCity = toCity.upper()
    self.plusDate = int(plusDate)
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self, **kwargs):
    species = kwargs.pop("species", None)
    if species is None:
        raise NotConfigured
    self.species = species.lower()
    CrawlSpider.__init__(self, **kwargs)
    dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
    self.index = defaultdict(list)
def __init__(self, fromCity, toCity, dateStart, dateEnd):
    self.fromCity = fromCity
    self.toCity = toCity
    self.dateStart = int(dateStart)
    self.dateEnd = int(dateEnd)
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self, xpath_dict={}, files=None):
    CrawlSpider.__init__(self)
    self.xpath_dict = xpath_dict
    self.from_url_file = files
    self.savingPipe = SavingPipeline()
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    if self.from_url_file:
        self.crawl_from_files()
def __init__(self):
    CrawlSpider.__init__(self)
    log.start(logfile="./log/szlib-%s.log" % strftime("%m%d-%H-%M", localtime(time())),
              loglevel=log.INFO, logstdout=False)
    log.msg("szlibspider start")
    print "szlibspider start"
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*firefox /usr/lib/firefox/firefox",
                             "http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver = Firefox()
    self.selenium.start(driver=ffdriver)
    # self.selenium.start()
    sel = self.selenium
    # sel.open("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    ffdriver.get("http://www.szlib.gov.cn/libraryNetwork/selfLib/id-5.html")
    # Wait for the page's JavaScript to finish loading.
    WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting for page to load")
    # time.sleep(20)
    # sel.wait_for_condition("condition by js", 20000)
    # print "ul/li visible? %s" % sel.is_element_present("//ul[@class='servicepointlist']")
    elements = ffdriver.find_elements_by_xpath(
        "//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit district_list']/a")
    num = "wijefowaeofjwejf SSL0011"
    selflibs_num = []
    for district in elements[1:]:
        log.msg("%s selflibs:" % district.text)
        log.msg("==================")
        district.click()
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load selflibs")
        selflibs_elements = ffdriver.find_elements_by_xpath(
            "//ul[@class='servicepointlist']/li[@class='item']")
        for selflib_ele in selflibs_elements:
            # num = selflib_ele.find_element_by_class_name("num").text
            num = selflib_ele.find_element_by_class_name("num").get_attribute("textContent")
            log.msg("num %s" % num)
            selflibs_num.append(num[-7:])
            log.msg("numid %s" % num[-7:])
            log.msg("%s" % selflib_ele.find_element_by_class_name("title").get_attribute("textContent"))
            log.msg("%s" % selflib_ele.find_element_by_class_name("text").get_attribute("textContent"))
            log.msg("---------------")
        log.msg("------1---------")
    # ffdriver.quit()
    # numstr = unicode("编号", "utf-8")
    # numstr = unicode(num, "utf-8")
    # log.msg("numstr is in num? %s" % (numstr in num))
    # log.msg("%s,%s, %s" % (num, num[1], num[-7:]))
    for selflibnum in selflibs_num:
        selflib_url = "http://www.szlib.gov.cn/libraryNetwork/dispSelfLibBook/id-5/%s.html" % selflibnum
        log.msg("selflib url %s" % selflib_url)
        ffdriver.get(selflib_url)
        WebDriverWait(ffdriver, 30).until(ajax_complete, "Timeout waiting to load booklist")
        categorys_elements = ffdriver.find_elements_by_xpath(
            "//div[@class='boxtext']/div[@class='filterbox_1']/div[@class='text tab_4_tit category']/a")
        for category_ele in categorys_elements[1:]:
            log.msg("%s" % category_ele.text)
def __init__(self, url=None, db_name='tags.db', *args, **kwargs):
    CrawlSpider.__init__(self)
    # If no database name was provided, the default above is used.
    self.db_name = db_name
    # Define the space in which the spider can crawl,
    # and the space in which it begins to crawl.
    self.add_url(url)
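The add_url() helper used here (and in the file-driven variants later on) is not part of these snippets. A minimal sketch of what it might do, assuming it seeds both start_urls and allowed_domains; the name, the urlparse-based domain extraction, and the guard are all assumptions:

# Hypothetical helper, not defined in the original snippets: registers a URL as a
# start URL and whitelists its domain so the crawl stays on that site.
from urlparse import urlparse  # Python 2, matching the era of these spiders

def add_url(self, url):
    if not url:
        return
    url = url.strip()
    self.start_urls.append(url)
    self.allowed_domains.append(urlparse(url).netloc)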
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # dispatcher.connect(self.spider_opened, signals.spider_opened)
    # dispatcher.connect(self.spider_closed, signals.spider_closed)
    xmlfiles = self.get_xml_files()
    for xmlfile in xmlfiles:
        self.start_urls.append(xmlfile)
def __init__(self):
    # Initialise the base spider with the original method (we are overriding '__init__()').
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # --- Disable opening the browser window ------------------------------------
    # Only works on Linux, because of the graphical dependencies...
    # self.display = Display(visible=0, backend='xvnb', size=(800, 600))
    # self.display = Display(visible=0, size=(800, 600))
    # self.display.start()
    # ----------------------------------------------------------------------------
    # Load the webdriver with the profile created by 'disableImages()'.
    self.driver = webdriver.Firefox(self.disableImages())
def __init__(self):
    CrawlSpider.__init__(self)
    print settings.DATABASE['HOST']
    conn = MySQLdb.connect(host=settings.DATABASE['HOST'], user=settings.DATABASE['USER'],
                           passwd=settings.DATABASE['PASSWORD'], db=settings.DATABASE['DBNAME'],
                           charset=settings.DATABASE['CHARSET'])
    cursor = conn.cursor()
    cursor.execute("SELECT crawled_url FROM des_city")
    parent_url_list = cursor.fetchall()
    for url in parent_url_list:
        # print url[0]
        self.start_urls.append(url[0] + '/jingdian')
    for url in self.start_urls:
        print url
def __init__(self, url, itemSelector, spiderID, spiderName="ScrapySinglePageCrawler", **kwargs):
    BaseCrawler.__init__(self, [url], spiderName, spiderID, **kwargs)
    CrawlSpider.__init__(self)
    self.item_extractor = ItemExtractor(itemSelector, self.item_loader,
                                        SpiderTypes.TYPE_SCRAPY, spiderName, self._id)
    self.url = url
    self.start_urls = [url]
def __init__(self, *args, **kwargs):
    '''
    Override the default constructor in order to populate
    the allowed_domains and start_urls lists.
    '''
    CrawlSpider.__init__(self, *args, **kwargs)
    domains = Domain.objects.all()
    for domain in domains:
        self.allowed_domains.append(str(domain.domain).replace("http://", "").rstrip("/"))
        self.start_urls.append(str(domain.domain).rstrip("/"))
def __init__(self):
    self.producer = Producer.objects.get(pk=1)
    self.brand = Brand.objects.get(pk=1)
    self.forged_cookie = dict(
        country="CHIM",
        SialLocaleDef="CountryCode~CN|WebLang~-7|",
        SessionPersistence="""CLICKSTREAMCLOUD%3A%3DvisitorId%3Danonymous%7CPROFILEDATA%3A%3D avatar%3D%2Fetc%2Fdesigns%2Fdefault%2Fimages%2Fcollab%2Favatar.png%2CauthorizableId%3D anonymous%2CauthorizableId_xss%3Danonymous%2CformattedName%3D%2CformattedName_xss%3D%7C SURFERINFO%3A%3DIP%3D141.247.239.190%2Ckeywords%3D%2Cbrowser%3DUnresolved%2COS%3DMac%20OS %20X%2Cresolution%3D1440x900%7C""",
        GUID="415dfb24-e4f2-4218-a5d7-b2943d012103|NULL|1380870456876",
        cmTPSet="Y",
    )
    CrawlSpider.__init__(self)
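None of these constructors show how forged_cookie is actually consumed. Scrapy's Request does accept a cookies argument, so the spider would presumably attach the dict when issuing requests; the start_requests body below is a hedged sketch, not the original code:

# Hypothetical usage of the forged cookie, not shown in the original snippets.
from scrapy.http import Request

def start_requests(self):
    for url in self.start_urls:
        # Attach the pre-built cookie dict to every outgoing request.
        yield Request(url, cookies=self.forged_cookie, callback=self.parse)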
def __init__(self):
    CrawlSpider.__init__(self)
    print settings.DATABASE['HOST']
    conn = MySQLdb.connect(host=settings.DATABASE['HOST'], user=settings.DATABASE['USER'],
                           passwd=settings.DATABASE['PASSWORD'], db=settings.DATABASE['DBNAME'],
                           charset=settings.DATABASE['CHARSET'])
    cursor = conn.cursor()
    cursor.execute("SELECT crawled_url FROM des_city")
    parent_url_list = cursor.fetchall()
    for url in parent_url_list:
        # print url[0]
        self.start_urls.append(url[0] + '/jingdian')
    for url in self.start_urls:
        print url
def __init__(self, fromCity, toCity, dateStart, dateEnd, maxTries=7):
    (self.fromCity, self.fromValue, self.fromNation,
     self.fromRegion, self.fromCityCode) = fromCity.split(",")
    self.fromValue += "#"
    print self.fromCity, self.fromValue, self.fromNation, self.fromRegion, self.fromCityCode
    (self.toCity, self.toValue, self.toNation,
     self.toRegion, self.toCityCode) = toCity.split(",")
    self.toValue += "#"
    print self.toCity, self.toValue, self.toNation, self.toRegion, self.toCityCode
    self.dateStart = int(dateStart)
    self.dateEnd = int(dateEnd)
    self.maxTries = maxTries
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self, url=None, db_name=None, filename=None):
    # Initialise the inherited crawler.
    CrawlSpider.__init__(self)
    # Initialise the database name.
    self.db_name = db_name
    if url is not None:
        self.add_url(url)
    # Read input URLs from a file.
    if filename is not None:
        with open(filename) as f:
            lines = f.readlines()
            for line in lines:
                self.add_url(line.strip())  # strip the trailing newline from each line
def __init__(self, url=None, db_name=None, *args, **kwargs):
    CrawlSpider.__init__(self)
    # If no database name was provided, fall back to a default.
    if db_name is None:
        db_name = "contents.db"
    # Database object.
    self.database = scrapyDatabase(db_name)
    # Create the Content table if it doesn't exist.
    self.database.createContentTable('Content')
    # Define the space in which the spider can crawl,
    # and the space in which it begins to crawl.
    self.add_url(url)
def __init__(self):
    CrawlSpider.__init__(self)
    db = MySQLdb.connect(host=Config.mysqlserver, user=Config.mysqlusername,
                         passwd=Config.mysqlpassword, db=Config.mysqldatabase)
    cursor = db.cursor()
    cursor.execute('''
        SELECT *
        FROM foundtypes
        ''')
    rows = cursor.fetchall()
    for row in rows:
        if len(row) > 4:  # need at least 5 columns so row[4] exists
            self.start_urls.append('http://battle.net%s' % row[4])
    cursor.close()
    db.close()
def __init__(self, baseURL, urlGenerator, itemSelector, spiderID,
             spiderName="ScrapyPageListCrawler", filterPredicate=None, **kwargs):
    # Get a URL from the generator so BaseCrawler is able to get URL_PARAMS.
    BaseCrawler.__init__(self, [baseURL], spiderName, spiderID, **kwargs)
    CrawlSpider.__init__(self)
    self.start_urls = urlGenerator()
    self.item_extractor = FilteringItemExtractor(itemSelector, self.item_loader,
                                                 SpiderTypes.TYPE_SCRAPY, self.name, self._id,
                                                 filterPredicate=filterPredicate)
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    # Create a profile with specific add-ons and have Firefox load it:
    ## profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
    ## self.selenium = webdriver.Firefox(profile)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--ignore-certificate-errors')
    self.selenium = webdriver.Chrome(chrome_options=chrome_options,
                                     executable_path=r"C:/Users/home/chromedriver.exe")  # webdriver.Firefox()
    self.selenium.get("http://www.python.org")
    assert "Python" in self.selenium.title
    elem = self.selenium.find_element_by_name("q")
    elem.clear()
    elem.send_keys("pycon")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in self.selenium.page_source
def __init__(self, name=None, **kwargs):
    log.msg(kwargs, level=log.INFO)
    self.min_year = kwargs['min_year'] or 1893
    self.max_year = kwargs['max_year'] or 1924
    log.msg(self.min_year, level=log.INFO)
    log.msg(self.max_year, level=log.INFO)
    self.set_banned_years()
    self.rules += (
        Rule(SgmlLinkExtractor(allow=".+devel.+"), follow=False),
        Rule(SgmlLinkExtractor(allow=self.lower_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=self.upper_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=(self.INDEX_REGEX)), callback='parse_indexed_work'),
        # If it is none of the above, it is a work and must be parsed!
        Rule(SgmlLinkExtractor(allow=(self.CHAPTER_REGEX)), callback='parse_unindexed_work'),
    )
    log.msg(self.rules, level=log.INFO)
    # This goes down here BECAUSE SCRAPY WANTS IT THAT WAY
    # (https://groups.google.com/forum/?fromgroups=#!topic/scrapy-users/Z7PjHuBzmA8)
    CrawlSpider.__init__(self, name)
def __init__(self, fromArea, fromCity, dateStart, dateEnd):
    self.fromCity = fromCity.lower()
    self.fromArea = None
    if fromArea == "europe":
        self.fromArea = u"歐洲"
    elif fromArea in ["asia", "china"]:
        self.fromArea = u"亞洲"
    elif fromArea == "america":
        self.fromArea = u"美洲"
    elif fromArea == "oceania":
        self.fromArea = u"大洋洲"
    self.dateStart = int(dateStart)
    self.dateEnd = int(dateEnd)
    self.destinationCities = []
    self.tickets = []
    CrawlSpider.__init__(self)
    self.startWebDriver()
def __init__(self):
    CrawlSpider.__init__(self)
    db = MySQLdb.connect(
        host=Config.mysqlserver,
        user=Config.mysqlusername,
        passwd=Config.mysqlpassword,
        db=Config.mysqldatabase,
    )
    cursor = db.cursor()
    cursor.execute(
        """
        SELECT *
        FROM foundtypes
        """
    )
    rows = cursor.fetchall()
    for row in rows:
        if len(row) > 4:  # need at least 5 columns so row[4] exists
            self.start_urls.append("http://battle.net%s" % row[4])
    cursor.close()
    db.close()
def __init__(self, name=None, **kwargs):
    log.msg(kwargs, level=log.INFO)
    self.min_year = kwargs['min_year'] or 1893
    self.max_year = kwargs['max_year'] or 1924
    log.msg(self.min_year, level=log.INFO)
    log.msg(self.max_year, level=log.INFO)
    self.set_banned_years()
    self.rules += (
        Rule(SgmlLinkExtractor(allow=".+devel.+"), follow=False),
        Rule(SgmlLinkExtractor(allow=self.lower_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=self.upper_regex), follow=False),
        Rule(SgmlLinkExtractor(allow=(self.INDEX_REGEX)), callback='parse_indexed_work'),
        # If it is none of the above, it is a work and must be parsed!
        Rule(SgmlLinkExtractor(allow=(self.CHAPTER_REGEX)), callback='parse_unindexed_work'))
    log.msg(self.rules, level=log.INFO)
    # This goes down here BECAUSE SCRAPY WANTS IT THAT WAY
    # (https://groups.google.com/forum/?fromgroups=#!topic/scrapy-users/Z7PjHuBzmA8)
    CrawlSpider.__init__(self, name)
def __init__(self):
    # Init spider.
    self.config.read('./configrations.ini')
    self.allowed_domains = ["web.archive.org"]
    self.start_urls = [
        self.config.get('target', 'startUrl'),
    ]
    self.rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'.*/http://%s/.*' % self.config.get('target', 'domain').replace('.', '\.')),
                deny_extensions='',  # http://www.haogongju.net/art/1690534
                tags=('a', 'area', 'link', 'script', 'img'),
                attrs=('href', 'src'),
            ),
            callback='parse_item',
            follow=True,
        ),
    )
    # Call CrawlSpider.__init__ to initialise the real spider.
    CrawlSpider.__init__(self)
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*firefox", "http://www.jb51.net")
    self.selenium.start()
def __init__(self, url, allowed_domain):
    self.start_urls.append(url)
    self.allowed_domains.append(allowed_domain)
    CrawlSpider.__init__(self)
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self, *args, **kwargs)
    self.browser = webdriver.PhantomJS()
    self.prePauseTime = time.time()
def __init__(self):
    self.producer = Producer.objects.get(name='Abnova')
    self.brand = Brand.objects.get(name='Abnova')
    self.forged_cookie = dict(CookiesAbnovaSelectLanguage="CN")
    CrawlSpider.__init__(self)
def __init__(self):
    self.count = 0
    CrawlSpider.__init__(self)
    self.verificationErrors = []
def __init__(self, *args, **kwargs):
    CrawlSpider.__init__(self, *args, **kwargs)
def __init__(self):
    self.producer = Producer.objects.get(name="Anbo Biotech")
    self.brand = Brand.objects.get(name="Anbo Biotech")
    CrawlSpider.__init__(self)
def __init__(self):
    CrawlSpider.__init__(self)
def __init__(self):
    CrawlSpider.__init__(self)
    self.log = logging.getLogger('scpLogger')
    self.driver = webdriver.Firefox()
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
    self.selenium.start()
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.azlyrics.com/g/guccimane.html")
    self.selenium.start()
def __init__(self):
    CrawlSpider.__init__(self)
    # Use any browser you wish.
    self.browser = webdriver.Firefox()
def __init__(self):
    CrawlSpider.__init__(self)
    self.verificationErrors = []
    self.selenium = selenium("localhost", 4444, "*chrome", "http://www.try.com")
    self.selenium.start()
def __init__(self):
    CrawlSpider.__init__(self)
    self.count = 0
    self.MAX_MOVIE = 2000