Code example #1
File: xvfb.py  Project: wangwei0807/cv_crawler
from pyvirtualdisplay import Display


class Xvfb(object):
    def __init__(self, width=1366, height=768, visible=0):
        self.__virtual_display = None
        self.width = width
        self.height = height
        self.visible = visible

    def __init_display(self):
        if self.__virtual_display is None:
            self.__virtual_display = Display(visible=self.visible, size=(self.width, self.height))
            self.__virtual_display.start()

    def __enter__(self):
        self.__init_display()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._close_display()

    def _close_display(self):
        if self.__virtual_display:
            try:
                # pyvirtualdisplay's Display is shut down with stop()
                self.__virtual_display.stop()
            except Exception:
                pass
        self.__virtual_display = None

    @staticmethod
    def run(func, *args, **kwargs):
        runner = Xvfb()
        with runner:
            return func(*args, **kwargs)
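A minimal usage sketch for the wrapper above; take_screenshot is a hypothetical stand-in for any display-dependent task:

from selenium import webdriver

def take_screenshot():
    # Hypothetical task that needs an X display (e.g. a real browser).
    driver = webdriver.Firefox()
    try:
        driver.get("https://example.com")
        driver.save_screenshot("page.png")
    finally:
        driver.quit()

# As a context manager...
with Xvfb(width=1920, height=1080):
    take_screenshot()

# ...or through the static helper, which uses the default dimensions.
Xvfb.run(take_screenshot)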
Code example #2
import re

import scrapy
from pyvirtualdisplay import Display
from selenium import webdriver


class Spider(scrapy.Spider):
    name = "mayors"
    allowed_domains = ["www.cec.gov.tw"]
    start_urls = ["https://www.cec.gov.tw/pc/zh_TW/IDX/indexC.html",]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")

    def spider_closed(self, spider):
        self.display.stop()

    def parse(self, response):
        self.driver.get(response.url)
        nodes = scrapy.Selector(text=self.driver.page_source).xpath('//a[@target="_top"]')
        for node in nodes:
            county = node.xpath('text()').extract_first()
            print(county)
            yield response.follow(node, callback=self.parse_list, meta={'meta': county})

    def parse_list(self, response):
        for tr in response.css(u'table.tableT tr.trT'):
            d = {}
            d['type'] = 'mayors'
            d['county'] = response.meta['meta']
            d['constituency'] = 0
            d['elected'] = tr.xpath('td[1]/text()').extract_first().strip()
            d['number'] = int(tr.xpath('td[2]/text()').extract_first())
            d['votes'] = int(re.sub(r'\D', '', tr.xpath('td[5]/text()').extract_first()))
            d['votes_percentage'] = tr.xpath('td[6]/text()').extract_first()
            yield d
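Note that spider_closed above is not invoked automatically under that name; a common Scrapy pattern (a sketch, using the standard signals API) is to connect it inside the class via from_crawler so the virtual display is stopped on shutdown:

from scrapy import signals

class Spider(scrapy.Spider):
    # ... fields and methods as above ...

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(Spider, cls).from_crawler(crawler, *args, **kwargs)
        # Fire self.spider_closed when the spider_closed signal is sent.
        crawler.signals.connect(spider.spider_closed,
                                signal=signals.spider_closed)
        return spider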
Code example #3
import re

import scrapy
from pyvirtualdisplay import Display
from selenium import webdriver


class Spider(scrapy.Spider):
    name = "mayors"
    allowed_domains = ["www.cec.gov.tw"]
    start_urls = [
        "https://www.cec.gov.tw/pc/zh_TW/IDX/indexT.html",
    ]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")

    def spider_closed(self, spider):
        self.display.stop()

    def parse(self, response):
        self.driver.get(response.url)
        nodes = scrapy.Selector(
            text=self.driver.page_source).xpath('//a[@target="_top"]')
        for node in nodes:
            constituency = int(
                re.sub(r'\D', '',
                       node.xpath('text()').extract_first()))
            yield response.follow(node,
                                  callback=self.parse_list,
                                  meta={'meta': constituency})

    def parse_list(self, response):
        for tr in response.css(u'table.tableT tr.trT'):
            d = {}
            d['type'] = 'councilors'
            d['county'] = response.xpath(
                u'//img[@alt="搜尋結果"]/following-sibling::b[1]/text()'
            ).extract_first().split()[0]
            d['constituency'] = response.meta['meta']
            d['elected'] = tr.xpath('td[1]/text()').extract_first().strip()
            d['number'] = int(tr.xpath('td[2]/text()').extract_first())
            d['votes'] = int(
                re.sub(r'\D', '',
                       tr.xpath('td[5]/text()').extract_first()))
            d['votes_percentage'] = tr.xpath('td[6]/text()').extract_first()
            yield d
Code example #4
import re
from random import randint
from time import sleep
from urlparse import urljoin  # Python 2; on Python 3 use urllib.parse

import scrapy
from pyvirtualdisplay import Display
from scrapy import FormRequest, Request, Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class Spider(scrapy.Spider):
    name = "lis_by_ad"
    allowed_domains = ["lis.ly.gov.tw"]
    start_urls = [
        "http://lis.ly.gov.tw/lylgmeetc/lgmeetkm_lgmem",
    ]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")
        self.ad = ad

    def spider_closed(self, spider):
        self.display.stop()

    def parse(self, response):
        yield FormRequest.from_response(
            response,
            formdata={
                '_20_8_T': str(self.ad).zfill(2),
                'INFO': response.xpath(
                    '//input[@name="INFO"]/@value').extract_first()
            },
            callback=self.parse_max_per_page)

    def parse_max_per_page(self, response):
        href = response.xpath(
            r'//select[@onchange="instback(this)"]/option[re:test(text(), "^\d+$")]/@value'
        ).extract()
        yield Request(urljoin(response.url, href[-1]),
                      callback=self.parse_law_bill_list,
                      dont_filter=True)

    def parse_law_bill_list(self, response):
        self.driver.get(response.url)
        while True:
            try:
                # Wait for the result block to render before scraping.
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, "block30")))
            except TimeoutException:
                continue
            sleep(randint(1, 2))
            nodes = Selector(
                text=self.driver.page_source).xpath('//a[@class="link02"]')
            for node in nodes[1::2]:
                href = node.xpath('@href').extract_first()
                yield Request(urljoin(response.url, href),
                              callback=self.parse_law_bill,
                              dont_filter=True)
            try:
                # Advance the pager; stop once the "next page" button
                # is no longer present.
                next_page = self.driver.find_element_by_xpath(
                    '//input[@name="_IMG_次頁"]')
                next_page.click()
            except NoSuchElementException:
                break
        self.driver.close()

    def parse_law_bill(self, response):
        trs = response.xpath('//tr[@class="rectr"]')
        item = {
            tr.xpath('td[1]/nobr/text()').extract_first(): first_or_list(
                tr.xpath('td[1]/nobr/text()').extract_first(),
                tr.xpath('td[2]//text()').extract())
            for tr in trs
        }
        item.pop(u"關係文書", None)  # not real item data; parsed as a link below
        has_motions = response.xpath(
            u'//img[@src="/lylegis/images/ref4.png"]/parent::a/@href'
        ).extract_first()
        bill_ref_pdf = response.xpath(
            u'//img[@src="/lylgmeet/img/view.png"]/parent::a/@href'
        ).extract_first()
        bill_ref_doc = response.xpath(
            u'//img[@src="/lylgmeet/img/doc_icon.png"]/parent::a/@href'
        ).extract_first()
        if bill_ref_pdf:
            bill_ref = urljoin(
                response.url,
                '/lgcgi/lgmeetimage?%s' % bill_ref_pdf.split('^')[-1])
        elif bill_ref_doc:
            bill_ref = urljoin(response.url, bill_ref_doc)
        else:
            bill_ref = ''
        item['links'] = {
            u'關係文書': bill_ref,
            u'審議進度':
            urljoin(response.url, has_motions) if has_motions else None
        }
        if has_motions:
            yield Request(item['links'][u'審議進度'],
                          callback=self.parse_law_bill_motions,
                          dont_filter=True,
                          meta={'item': item})
        else:
            item['motions'] = []
            yield item

    def parse_law_bill_motions(self, response):
        item = response.request.meta['item']
        motions = []
        for node in response.xpath('//tr[@class="onetr0"]/parent::table'):
            motion = {}
            for tr in node.xpath('.//tr[@class="onetr1"]'):
                motion[tr.xpath(
                    'td[1]/text()').extract_first()] = first_or_list(
                        tr.xpath('td[2]//text()').extract())
            motion.pop(u"影像", None)
            motions.append(motion)
        item['motions'] = motions
        yield item
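The code above relies on a first_or_list helper that the excerpt never defines. Judging from its call sites (the final argument is always a list of extracted strings), a plausible reconstruction might be:

def first_or_list(*args):
    # Hypothetical reconstruction, not the original source: collapse a
    # one-element list of extracted strings to the bare string, otherwise
    # return the list unchanged; any leading key argument is ignored.
    values = args[-1]
    return values[0] if len(values) == 1 else values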
Code example #5
    def getstatement(self):
        import os
        import time
        from datetime import datetime

        from dateutil.relativedelta import relativedelta
        from pyvirtualdisplay import Display
        from selenium import webdriver
        from selenium.common.exceptions import NoSuchElementException
        from selenium.webdriver.common.action_chains import ActionChains
        from selenium.webdriver.common.keys import Keys

        from driver_builder import DriverBuilder

        # Set up options for the Chrome web browser.
        mn = Main()  # Main is defined elsewhere in the original module
        mid_to_search = mn.get_mid_search()
        display = Display(visible=0, size=(800, 600))
        display.start()

        driver_builder = DriverBuilder()
        # dwn (a download directory) is defined elsewhere in the original module.
        self.browser = driver_builder.get_driver(dwn, headless=False)
        browser = self.browser

        browser.get("https://www.youraccessone.com")

        username = browser.find_element_by_id("txtUserName")
        password = browser.find_element_by_id("txtPassword")

        username.send_keys(os.environ["username"])
        password.send_keys(os.environ["password"])

        browser.find_element_by_name("uxLogin").click()
        # go to chargebacks
        browser.get("https://www.youraccessone.com/64_rpt_Statements.aspx")

        # open filter
        browser.find_element_by_xpath("//*[text()='FILTER']").click()

        for item in mid_to_search:
            # check my merchant
            print(item)
            for x in range(5):
                try:
                    browser.find_element_by_id(
                        "ctl00_ContentPage_uxFiltering_uxReportFilter_ctl00"
                    ).click()
                    break
                except NoSuchElementException:
                    time.sleep(2)

            time.sleep(2)
            browser.find_element_by_xpath(
                "//*[text()='Merchant Number']").click()
            time.sleep(.5)
            mid = browser.find_element_by_id(
                "ctl00_ContentPage_uxFiltering_uxReportFilter_inMERCHANTNUMBER"
            )
            mid.clear()
            mid.send_keys(item)

            # Click search
            time.sleep(.5)
            try:
                browser.find_element_by_name(
                    "ctl00$ContentPage$uxFiltering$uxReportFilter$btSubmit"
                ).click()
                my_val = browser.find_element_by_name(
                    "ctl00$ContentPage$uxdroplist").get_attribute("value")
                my_val = datetime.strptime(my_val, '%m/%d/%Y')
                now = datetime.now() - relativedelta(months=1)
                print(my_val.month, now.month, my_val.year, now.year)
                if my_val.month == now.month and my_val.year == now.year:
                    browser.find_element_by_name(
                        "ctl00$ContentPage$uxSearch").click()
            except NoSuchElementException:
                continue

        browser.quit()
        try:
            display.stop()
        except AttributeError:
            print("The Display is already closed?")
Code example #6
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

display = Display(visible=0, size=[800, 600])
display.start()
print("launching")
driver = webdriver.Firefox()

driver.get("http://www.industrie-expo.com/liste-catalogue-exposants/")

print("")

pageid = 2
while True:
    try:
        # Trigger the site's own JavaScript pager; stop when the call fails.
        driver.execute_script("searchExposant(" + str(pageid) + ", '#')")
        pageid += 1
        print(pageid)
    except WebDriverException:
        break

driver.quit()  # quit() ends the whole driver session, not just the window
display.stop()
#http://www.marinamele.com/selenium-tutorial-web-scraping-with-selenium-and-python