Example #1
 def parse(self, response):
     # Collect every report link from the index table; the last link is skipped.
     endpoints = Selector(response).xpath("//td/font/a")
     endpoints.pop()
     items = []
     for endpoint in endpoints:
         item = UfoScraperItem()
         item["title"] = endpoint.xpath("text()").extract()[0]
         item["link"] = "http://www.nuforc.org/webreports/" + endpoint.xpath("@href").extract()[0]
         # items is only accumulated locally; the spider output is the Requests below.
         items.append(item)
         yield Request(item["link"], callback=self.parse_page1)
Example #2
 def parse(self, response):
     # Name the output file after the host part of the URL.
     filename = response.url.split("//")[1].split("/")[0] + '.txt'
     # Extract the raw HTML of every matching <a> element and re-parse each fragment.
     html = Selector(response=response).xpath('//a[re:test(@class, "list_item")]').extract()
     f = open(filename, 'w')
     for line in html:
         if line.strip() == '':
             continue
         href = Selector(text=line).xpath('//a//@href').extract()
         title = Selector(text=line).xpath('//h2[re:test(@class, "title")]/text()').extract()
         img = Selector(text=line).xpath('//img[re:test(@class, "img")]//@src').extract()
         # Skip fragments missing any field rather than letting pop() raise IndexError.
         if not (title and href and img):
             continue
         f.write(title.pop() + '\n')
         f.write(href.pop() + '\n')
         f.write(img.pop() + '\n')
         f.write('\n\n')
     f.close()
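Example #2 extracts the raw HTML of each anchor and re-parses it with Selector(text=...). An alternative sketch (not the original author's code) reads the fields directly from the nested selectors, using contains() in place of the EXSLT re:test() from the snippet:
 def parse(self, response):
     filename = response.url.split("//")[1].split("/")[0] + '.txt'
     with open(filename, 'w') as f:
         for link in response.xpath('//a[contains(@class, "list_item")]'):
             title = link.xpath('.//h2[contains(@class, "title")]/text()').extract_first()
             href = link.xpath('@href').extract_first()
             img = link.xpath('.//img[contains(@class, "img")]/@src').extract_first()
             # Write the same three-line record as above, skipping incomplete entries.
             if title and href and img:
                 f.write(title + '\n' + href + '\n' + img + '\n\n\n')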
Example #3
 def parse(self, response):
     sel = Selector(response)
     try:
         # The search results page embeds the weibo list as a JSON payload.
         page_json = sel.re(weibos_re)[0]
     except IndexError:
         # No payload: treat it as a failed or expired login.
         raise LoginFailed()
     page_html = json.loads(page_json).get('html')
     if not page_html:
         raise IgnoreRequest()
     # Collect the pagination links, drop the last one, and add page 1 explicitly.
     page_urls = Selector(text=page_html).xpath(
         './/a[contains(@suda-data,"key=tblog_search_weibo&value=weibo_page")]/@href'
     ).extract()
     page_urls.pop()
     page_urls.append(self.search_url.format(1))
     for href in page_urls:
         url = ''.join([self.url_prefix, href])
         yield Request(url=url,
                       meta={'cookiejar': 1},
                       cookies=self.cookies,
                       callback=self.parse_weibo)
 def one_search(self, driver, search_formula, published_date_from,
                published_date_to):
     # Example arguments:
     #     search_formula = '软件学报'  (UTF-8 byte string, Python 2)
     #     published_date_from = '2012-1-1'
     #     published_date_to = '2015-1-1'
     print search_formula
     # Fill in the CNKI search form and submit it.
     driver.find_element_by_name('magazine_value1').send_keys(
         search_formula.decode('utf-8'))
     driver.find_element_by_name('publishdate_from').send_keys(
         published_date_from)
     driver.find_element_by_name('publishdate_to').send_keys(
         published_date_to)
     driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
     time.sleep(10)
     flag = True
     page_num = 1
     # The result list is rendered inside an iframe.
     driver.switch_to.frame('iframeResult')
     fp = open('linksSet.txt', 'w')
     while flag:
         # Save the article links found on the current result page.
         page = driver.page_source
         hrefs = Selector(text=page).xpath(
             '//a[contains(@class,"fz14")]/@href').extract()
         print hrefs
         while hrefs:
             href = hrefs.pop()
             real_href = 'http://www.cnki.net' + href.replace('kns', 'KCMS')
             fp.write(str(real_href) + "\r\n")
             print '----------------------------'
             print hrefs
             if hrefs:
                 continue
             # All links on this page are written; try to move to the next page.
             try:
                 next_page = driver.find_element_by_xpath(
                     '//*[@id="Page_next"]')
                 next_page.click()
                 page_num += 1
                 time.sleep(5)
                 break
             except NoSuchElementException:
                 print "last page reached, total number of pages is %d" % page_num
                 flag = False
     fp.close()
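one_search relies on module-level imports that are not shown on this page and on a Selenium WebDriver prepared by the caller. A sketch of those assumed imports and of the setup around a single call (sample arguments taken from the commented values in the snippet) might be:
 import time
 from scrapy.selector import Selector
 from selenium import webdriver
 from selenium.common.exceptions import NoSuchElementException

 driver = webdriver.Firefox()
 driver.get('http://www.cnki.net')  # assumed: a page exposing the search form
 # one_search(driver, '软件学报', '2012-1-1', '2015-1-1') would then run one query
 # and write the collected article links to linksSet.txt.
 driver.quit()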