from pyvirtualdisplay import Display


class Xvfb(object):
    """Context manager that runs code inside a virtual X display (Xvfb)."""

    def __init__(self, width=1366, height=768, visible=0):
        self.__virtual_display = None
        self.width = width
        self.height = height
        self.visible = visible

    def __init_display(self):
        # Lazily create and start the display on first use.
        if self.__virtual_display is None:
            self.__virtual_display = Display(visible=self.visible,
                                             size=(self.width, self.height))
            self.__virtual_display.start()

    def __enter__(self):
        self.__init_display()
        return self  # allow `with Xvfb() as x:` usage

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._close_display()

    def _close_display(self):
        if self.__virtual_display:
            try:
                # pyvirtualdisplay's documented teardown method is stop()
                self.__virtual_display.stop()
            except Exception:
                pass
            self.__virtual_display = None

    @staticmethod
    def run(func, *args, **kwargs):
        """Run func(*args, **kwargs) inside a fresh virtual display."""
        runner = Xvfb()
        with runner:
            return func(*args, **kwargs)
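# Usage sketch for the wrapper above. `fetch_title` is a hypothetical function
# used only for illustration; it assumes selenium, a Firefox driver and an
# Xvfb binary are installed.
def fetch_title(url):
    from selenium import webdriver
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        return driver.title
    finally:
        driver.quit()

# Either as a context manager ...
with Xvfb(width=1024, height=768):
    print(fetch_title("https://example.com"))

# ... or via the static helper, which creates and tears down its own display.
print(Xvfb.run(fetch_title, "https://example.com"))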
import re

import scrapy
from pyvirtualdisplay import Display
from selenium import webdriver


class Spider(scrapy.Spider):
    name = "mayors"
    allowed_domains = ["www.cec.gov.tw"]
    start_urls = ["https://www.cec.gov.tw/pc/zh_TW/IDX/indexC.html"]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")

    def spider_closed(self, spider):
        # Only runs if connected to the spider_closed signal
        # (see the from_crawler sketch below).
        self.driver.quit()
        self.display.stop()

    def parse(self, response):
        # Render the index page with Selenium, then follow each county link.
        self.driver.get(response.url)
        nodes = scrapy.Selector(
            text=self.driver.page_source).xpath('//a[@target="_top"]')
        for node in nodes:
            county = node.xpath('text()').extract_first()
            print(county)
            yield response.follow(node, callback=self.parse_list,
                                  meta={'meta': county})

    def parse_list(self, response):
        for tr in response.css(u'table.tableT tr.trT'):
            d = {}
            d['type'] = 'mayors'
            d['county'] = response.meta['meta']
            d['constituency'] = 0
            d['elected'] = tr.xpath('td[1]/text()').extract_first().strip()
            d['number'] = int(tr.xpath('td[2]/text()').extract_first())
            d['votes'] = int(
                re.sub(r'\D', '', tr.xpath('td[5]/text()').extract_first()))
            d['votes_percentage'] = tr.xpath('td[6]/text()').extract_first()
            yield d
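# As written, spider_closed above is never invoked: Scrapy only calls it if it
# is connected to the spider_closed signal. A sketch of the standard wiring,
# which applies to every spider in this file:
#
#     from scrapy import signals
#
#     @classmethod
#     def from_crawler(cls, crawler, *args, **kwargs):
#         spider = super(Spider, cls).from_crawler(crawler, *args, **kwargs)
#         crawler.signals.connect(spider.spider_closed,
#                                 signal=signals.spider_closed)
#         return spider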
import re

import scrapy
from pyvirtualdisplay import Display
from selenium import webdriver


class Spider(scrapy.Spider):
    # This spider scrapes councilors (indexT.html), so it needs a name that
    # does not collide with the mayors spider above.
    name = "councilors"
    allowed_domains = ["www.cec.gov.tw"]
    start_urls = ["https://www.cec.gov.tw/pc/zh_TW/IDX/indexT.html"]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")

    def spider_closed(self, spider):
        self.driver.quit()
        self.display.stop()

    def parse(self, response):
        # Render the index page with Selenium, then follow each constituency link.
        self.driver.get(response.url)
        nodes = scrapy.Selector(
            text=self.driver.page_source).xpath('//a[@target="_top"]')
        for node in nodes:
            constituency = int(
                re.sub(r'\D', '', node.xpath('text()').extract_first()))
            yield response.follow(node, callback=self.parse_list,
                                  meta={'meta': constituency})

    def parse_list(self, response):
        for tr in response.css(u'table.tableT tr.trT'):
            d = {}
            d['type'] = 'councilors'
            # The county name is the first token after the "搜尋結果" header image.
            d['county'] = response.xpath(
                u'//img[@alt="搜尋結果"]/following-sibling::b[1]/text()'
            ).extract_first().split()[0]
            d['constituency'] = response.meta['meta']
            d['elected'] = tr.xpath('td[1]/text()').extract_first().strip()
            d['number'] = int(tr.xpath('td[2]/text()').extract_first())
            d['votes'] = int(
                re.sub(r'\D', '', tr.xpath('td[5]/text()').extract_first()))
            d['votes_percentage'] = tr.xpath('td[6]/text()').extract_first()
            yield d
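# A hedged sketch of running one of these spiders programmatically; the usual
# alternative is the `scrapy crawl mayors` / `scrapy crawl councilors` CLI.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(Spider)
process.start()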
import re
from random import randint
from time import sleep

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

import scrapy
from scrapy import FormRequest, Request, Selector
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# first_or_list is a project helper that is not part of this snippet;
# a plausible reconstruction follows the class.


class Spider(scrapy.Spider):
    name = "lis_by_ad"
    allowed_domains = ["lis.ly.gov.tw"]
    start_urls = ["http://lis.ly.gov.tw/lylgmeetc/lgmeetkm_lgmem"]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")
        self.ad = ad

    def spider_closed(self, spider):
        self.display.stop()

    def parse(self, response):
        # Select the requested legislative term (ad) in the search form.
        yield FormRequest.from_response(
            response,
            formdata={
                '_20_8_T': str(self.ad).zfill(2),
                'INFO': response.xpath(
                    '//input[@name="INFO"]/@value').extract_first()
            },
            callback=self.parse_max_per_page)

    def parse_max_per_page(self, response):
        # Follow the largest "results per page" option to minimize paging.
        href = response.xpath(
            '//select[@onchange="instback(this)"]'
            '/option[re:test(text(), "^\\d+$")]/@value').extract()
        yield Request(urljoin(response.url, href[-1]),
                      callback=self.parse_law_bill_list,
                      dont_filter=True)

    def parse_law_bill_list(self, response):
        self.driver.get(response.url)
        while True:
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, "block30")))
            except TimeoutException:
                continue  # keep waiting for the results block to render
            sleep(randint(1, 2))
            nodes = Selector(
                text=self.driver.page_source).xpath('//a[@class="link02"]')
            for node in nodes[1::2]:
                href = node.xpath('@href').extract_first()
                yield Request(urljoin(response.url, href),
                              callback=self.parse_law_bill,
                              dont_filter=True)
            try:
                # "次頁" is the "next page" button; stop when it disappears.
                next_page = self.driver.find_element_by_xpath(
                    '//input[@name="_IMG_次頁"]')
                next_page.click()
            except NoSuchElementException:
                break
        self.driver.quit()

    def parse_law_bill(self, response):
        trs = response.xpath('//tr[@class="rectr"]')
        item = {
            tr.xpath('td[1]/nobr/text()').extract_first(): first_or_list(
                tr.xpath('td[1]/nobr/text()').extract_first(),
                tr.xpath('td[2]//text()').extract())
            for tr in trs
        }
        # The raw "關係文書" cell is not usable as-is; it is rebuilt from the
        # document links below.
        item.pop(u"關係文書", None)
        has_motions = response.xpath(
            u'//img[@src="/lylegis/images/ref4.png"]/parent::a/@href'
        ).extract_first()
        bill_ref_pdf = response.xpath(
            u'//img[@src="/lylgmeet/img/view.png"]/parent::a/@href'
        ).extract_first()
        bill_ref_doc = response.xpath(
            u'//img[@src="/lylgmeet/img/doc_icon.png"]/parent::a/@href'
        ).extract_first()
        if bill_ref_pdf:
            bill_ref = urljoin(
                response.url,
                '/lgcgi/lgmeetimage?%s' % bill_ref_pdf.split('^')[-1])
        elif bill_ref_doc:
            bill_ref = urljoin(response.url, bill_ref_doc)
        else:
            bill_ref = ''
        item['links'] = {
            u'關係文書': bill_ref,
            u'審議進度': urljoin(response.url,
                             has_motions) if has_motions else None
        }
        if has_motions:
            yield Request(item['links'][u'審議進度'],
                          callback=self.parse_law_bill_motions,
                          dont_filter=True,
                          meta={'item': item})
        else:
            item['motions'] = []
            yield item

    def parse_law_bill_motions(self, response):
        item = response.request.meta['item']
        motions = []
        for node in response.xpath('//tr[@class="onetr0"]/parent::table'):
            motion = {}
            for tr in node.xpath('.//tr[@class="onetr1"]'):
                motion[tr.xpath('td[1]/text()').extract_first()] = \
                    first_or_list(tr.xpath('td[2]//text()').extract())
            motion.pop(u"影像", None)
            motions.append(motion)
        item['motions'] = motions
        yield item
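# `first_or_list` is a project helper that is not included in this snippet.
# A hypothetical reconstruction based on its two call shapes above (a key plus
# a list of values, or a list of values alone): collapse a single-element list
# to its only element, otherwise keep the list. Not the original implementation.
def first_or_list(key_or_values, values=None):
    values = key_or_values if values is None else values
    return values[0] if len(values) == 1 else values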
def getstatement(self):
    """Log in to youraccessone.com and queue last month's statements."""
    import os
    import time
    from datetime import datetime

    from dateutil.relativedelta import relativedelta
    from pyvirtualdisplay import Display
    from selenium.common.exceptions import NoSuchElementException

    from driver_builder import DriverBuilder

    # `Main` and `dwn` (the download directory) come from the surrounding
    # module, which is not shown here.
    mn = Main()
    mid_to_search = mn.get_mid_search()
    display = Display(visible=0, size=(800, 600))
    display.start()
    driver_builder = DriverBuilder()
    self.browser = driver_builder.get_driver(dwn, headless=False)
    browser = self.browser
    browser.get("https://www.youraccessone.com")
    username = browser.find_element_by_id("txtUserName")
    password = browser.find_element_by_id("txtPassword")
    username.send_keys(os.environ["username"])
    password.send_keys(os.environ["password"])
    browser.find_element_by_name("uxLogin").click()
    # Go to the statements report page and open its filter panel.
    browser.get("https://www.youraccessone.com/64_rpt_Statements.aspx")
    browser.find_element_by_xpath("//*[text()='FILTER']").click()
    for item in mid_to_search:
        # Check each merchant number in turn.
        print(item)
        # The filter widget renders asynchronously; retry a few times.
        for x in range(5):
            try:
                browser.find_element_by_id(
                    "ctl00_ContentPage_uxFiltering_uxReportFilter_ctl00"
                ).click()
                break
            except NoSuchElementException:
                time.sleep(2)
        time.sleep(2)
        browser.find_element_by_xpath("//*[text()='Merchant Number']").click()
        time.sleep(.5)
        mid = browser.find_element_by_id(
            "ctl00_ContentPage_uxFiltering_uxReportFilter_inMERCHANTNUMBER")
        mid.clear()
        mid.send_keys(item)
        # Submit the filter, then only run the search if the newest statement
        # is from last month.
        time.sleep(.5)
        try:
            browser.find_element_by_name(
                "ctl00$ContentPage$uxFiltering$uxReportFilter$btSubmit"
            ).click()
            my_val = browser.find_element_by_name(
                "ctl00$ContentPage$uxdroplist").get_attribute("value")
            my_val = datetime.strptime(my_val, '%m/%d/%Y')
            now = datetime.now() - relativedelta(months=1)
            print(my_val.month, now.month, my_val.year, now.year)
            if my_val.month == now.month and my_val.year == now.year:
                browser.find_element_by_name(
                    "ctl00$ContentPage$uxSearch").click()
        except NoSuchElementException:
            continue
    browser.quit()
    try:
        display.stop()
    except AttributeError:
        print("The display is already closed?")
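# driver_builder is a local module that is not included above. A minimal,
# hypothetical sketch of what DriverBuilder.get_driver might look like given
# how it is called (a download directory plus a headless flag); not the
# original implementation.
class DriverBuilder(object):
    def get_driver(self, download_dir, headless=False):
        from selenium import webdriver
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument("--headless")
        # Route file downloads to the caller's directory.
        options.add_experimental_option(
            "prefs", {"download.default_directory": download_dir})
        return webdriver.Chrome(options=options)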
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

display = Display(visible=0, size=(800, 600))
display.start()
print("starting")
driver = webdriver.Firefox()
driver.get("http://www.industrie-expo.com/liste-catalogue-exposants/")

# Page through the exhibitor list by calling the site's own pagination
# function until it fails, i.e. there are no more pages.
pageid = 2
while True:
    try:
        driver.execute_script("searchExposant(%d, '#')" % pageid)
        pageid += 1
        print(pageid)
    except WebDriverException:
        break

driver.quit()
display.stop()
# http://www.marinamele.com/selenium-tutorial-web-scraping-with-selenium-and-python