Code example #1
    def crawlListPage(self):
        print 'Start crawling the hotel list pages'
        self.openPage(
            "http://hotel.elong.com/nanjing/"
        )
        # Number of polling loops on the current page (initial value 0)
        loop_num = 0
        # Whether the current page has been parsed: False = not yet handled, True = handled
        if_handle = False

        # Total number of result pages
        page_num = 0
        hotel_num = int(self.driver.find_element_by_xpath("//span[@class='t24 mr5']").text)
        if hotel_num % 20==0:
            page_num = hotel_num/20
        else:
            page_num = hotel_num/20 + 1

        # For testing: crawl only 5 pages
        #page_num = 5

        while page_num>=1:
            loop_num += 1
            self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
            #self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_UP)
            if u"返后价" in self.driver.page_source:
                if if_handle == False:
                    self.__parseUrls(self.driver.page_source)
                    print u"获取酒店数为:%d" % len(self.listPageInfo)
                    if_handle = True
                try:
                    # If the page is still loading, wait 0.1s and check again
                    response = HtmlResponse(url="My HTML String",body=self.driver.page_source,encoding="utf-8")
                    _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                    while 1:
                        if _loading == []:
                            break
                        if u'none' in _loading[0]:
                            break
                        else:
                            #print 'Loading...'
                            time.sleep(0.1)
                            response = HtmlResponse(url="My HTML String",body=self.driver.page_source,encoding="utf-8")
                            _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                    if u"下一页" in self.driver.page_source:
                        self.driver.find_element_by_xpath("//div[@class='paging1']/a[@class='page_next']").click()
                        page_num -= 1
                        if_handle = False
                        loop_num = 0
                        time.sleep(random.uniform(1, 3))
                except Exception, e:
                    print "error happen at clicking next-page"
                    print e

            if loop_num != 0:
                if loop_num < 15:
                    time.sleep(1)
                    continue
                else:
                    break
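Note: most of the snippets on this page share one pattern: wrap an HTML string (here, Selenium's driver.page_source) in scrapy.http.HtmlResponse so that Scrapy's XPath/CSS selectors can be used outside a running crawl. A minimal, self-contained sketch of that pattern follows; the URL and markup are placeholders, not taken from any of the projects above.

from scrapy.http import HtmlResponse

html = "<html><body><a class='name' href='/hotel/1' title='Sample Hotel'>Sample Hotel</a></body></html>"
# The url argument is only metadata here; any well-formed URL is accepted.
response = HtmlResponse(url="http://example.com", body=html, encoding="utf-8")
print(response.xpath("//a[@class='name']/@href").extract_first())    # -> /hotel/1
print(response.css("a.name::attr(title)").extract_first())           # -> Sample Hotel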
Code example #2
File: linksys.py  Project: MikimotoH/scraper
    def parse_kb(self, response):
        mib = None

        # need to perform some nasty segmentation because different firmware versions are not clearly separated
        # reverse order to get MIB before firmware items
        for entry in reversed(response.xpath(
                "//div[@id='support-article-downloads']/div/p")):
            for segment in reversed(entry.extract().split("<br><br>")):
                resp = HtmlResponse(
                    url=response.url, body=segment, encoding=response.encoding)
                for href in resp.xpath("//a/@href").extract():
                    text = resp.xpath("//text()").extract()

                    if "MIBs" in href:
                        mib = href

                    elif "firmware" in href:
                        text = resp.xpath("//text()").extract()

                        item = FirmwareLoader(
                            item=FirmwareImage(), response=resp, date_fmt=["%m/%d/%Y"])
                        item.add_value("date", item.find_date(text))
                        item.add_xpath("url", "//a/@href")
                        item.add_value("mib", mib)
                        item.add_value("product", response.meta["product"])
                        item.add_value("vendor", self.name)
                        item.add_value(
                            "version", FirmwareLoader.find_version_period(text))
                        yield item.load_item()
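A stripped-down illustration of the segmentation trick used above (and in several later examples): split one HTML document into chunks and wrap each chunk in its own HtmlResponse so the selectors only ever see that chunk. The markup below is invented for the example.

from scrapy.http import HtmlResponse

body = ("<p>Firmware 1.0 <a href='/fw10.img'>download</a><br><br>"
        "Firmware 2.0 <a href='/fw20.img'>download</a></p>")
for segment in body.split("<br><br>"):
    # Each fragment becomes its own response, so //a only matches links inside it.
    resp = HtmlResponse(url="http://example.com", body=segment, encoding="utf-8")
    print(resp.xpath("//a/@href").extract_first(), resp.xpath("//text()").extract())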
Code example #3
File: belkin.py  Project: MikimotoH/scraper
    def parse_kb(self, response):
        # initial html tokenization to find regions segmented by e.g. "======"
        # or "------"
        filtered = response.xpath(
            "//div[@class='sfdc_richtext']").extract()[0].split("=-")

        for entry in [x and x.strip() for x in filtered]:
            resp = HtmlResponse(url=response.url, body=entry,
                                encoding=response.encoding)

            for link in resp.xpath("//a"):
                href = link.xpath("@href").extract()[0]
                if "cache-www" in href:
                    text = resp.xpath("//text()").extract()
                    text_next = link.xpath("following::text()").extract()

                    item = FirmwareLoader(item=FirmwareImage(),
                                          response=response,
                                          date_fmt=["%b %d, %Y", "%B %d, %Y",
                                                    "%m/%d/%Y"])

                    version = FirmwareLoader.find_version_period(text_next)
                    if not version:
                        version = FirmwareLoader.find_version_period(text)

                    item.add_value("version", version)
                    item.add_value("date", item.find_date(text))
                    item.add_value("url", href)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    yield item.load_item()
Code example #4
 def __crawlHotelComment(self, driver, hotel_id, pagenum):
     pagenum = int(pagenum)
     # Iterate over all comment pages
     while pagenum>=1:
         response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
         loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
         # Only scrape once the loading indicator is hidden
         while loading != u'display: none;':
             print 'Loading...'
             time.sleep(0.1)
             response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
             loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
         itemlist = response.xpath("//ul[@class='dcomt_list']/li")
         for item in itemlist:
             username = item.xpath(".//div[@class='dcomt_head left']/div[2]/span/text()").extract()[0]
             remarkText = item.xpath(".//p[@class='dcomt_con_txt']/text()").extract()[0]
             # TODO: filter out non-Chinese characters (needs rework)
             remarkText = remarkText.encode("gbk",'ignore')
             remarkText = remarkText.decode("gbk")
             remark = ''
             for string in remarkText:
                 remark = remark + re.sub("\s+", "", string)
             user_type = item.xpath(".//div[@class='dcomt_head_pic']/p/text()").extract()[0]
             comm_time = item.xpath(".//span[@class='dcomt_con_time']/text()").extract()[0]
             goodorbad = item.xpath(".//p[@class='mb5']/i/@class").extract()[0]
             comm_type = ''
             if u'good' in goodorbad:
                 comm_type = "值得推荐"   # "worth recommending"
             if u'bad' in goodorbad:
                 comm_type = "有待改善"   # "needs improvement"
             senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
             viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"),decoding="utf-8"))
             comm ={
                 "guid":uuid.uuid1(),
                 "username":username,
                 "remark":remark,
                 "comm_time":comm_time,
                 "user_type":user_type,
                 "comm_type":comm_type,
                 "senti_value":senti_value,
                 "viewpoint":viewpoint,
                 "baseinfo_id":hotel_id
             }
             if self.__is_exist_in_comment_list(comm) is False:
                 self.commList.append(comm)
             else:
                 #print comm['remark']
                 pass
         if pagenum == 1:
             break
         # Click the next-page button
         self.scroll_and_click_by_xpath("//div[@id='comment_paging']/a[@class='page_next']")
         pagenum -= 1
         time.sleep(random.uniform(1,4))
         print pagenum
     return True
Code example #5
 def __parseUrls(self, page_source):
     response = HtmlResponse(url="my HTML string",body=page_source,encoding="utf-8")
      # Extract every hotel URL on the page into urlList
     urlList = response.xpath("//a[@class='name']/@href").extract()
     commnumList = response.xpath("//div[@class='comment']/a/span/text()").extract()
     name_list = response.xpath("//a[@class='name']/text()").extract()
     if len(urlList) == len(commnumList) == len(name_list):
         for i in range(0,len(urlList)):
             self.listPageInfo.append({
                 "guid":uuid.uuid1(),
                 "url":urlList[i],
                 "hotel_name":name_list[i],
                 "OTA":"途牛",
                 "comm_num":int(commnumList[i]),
             })
Code example #6
    def parse_reviews(
        self, response: HtmlResponse, product_id: int
    ) -> Iterable[Item]:
        reviews: List[HtmlResponse] = response.xpath(self.REVIEWS_LIST_XPATH)

        for review in reviews:
            rating: int = len(
                review.xpath(self.RATING_SELECTED_STARS_XPATH).getall()
            )
            time: str = review.xpath(self.TIMESTAMP_XPATH).get("")
            timestamp: float = (
                mktime(
                    datetime.strptime(time, self.TIMESTAMP_FORMAT).timetuple()
                )
                if time
                else 0.0
            )
            text: str = review.xpath(self.TEXT_XPATH).get("")
            size: str = review.xpath(self.SIZE_XPATH).get(": ").split(": ")[-1]
            color: str = (
                review.xpath(self.COLOR_XPATH).get(": ").split(": ")[-1]
            )

            yield ReviewItem(
                product_id=product_id,
                rating=rating,
                timestamp=timestamp,
                text=text,
                size=size,
                color=color,
            )
Code example #7
File: collection.py  Project: TomaseLiu/myzhihu
    def parse(self, response):
        print "test point"
        response = HtmlResponse(url=response.url,
                                status=response.status,
                                headers=response.headers,
                                body=response.body)
        url = response.url
        #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract()

        #        active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div/h2/a/text()').extract()
        active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div')
        file_obj = open('collection_now.log', 'w')

        for active_block in active_page_list:
            #active = active_block.xpath('.//div[1]/text()').extract()[1].strip()
            #question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
            #answer_link = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()[0]

            #if 'http' not in answer_link:
            #    answer_link = "http://www.zhihu.com" + answer_link
            question = active_block.xpath('.//h2/a/text()').extract()[0]
            #            answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a[@href="toggle-expand"]/@href').extract()
            answer_link = active_block.xpath(
                './/div/div[1]/div[4]/div/a/@href').extract()
            if len(answer_link) > 0:
                if 'http' not in answer_link[0]:
                    answer_link_str = "http://www.zhihu.com" + answer_link[0]
                else:
                    answer_link_str = answer_link[0]
#                print question, answer_link_str
                file_obj.write(
                    question.encode('utf-8') + '\t' +
                    answer_link_str.encode('utf-8') + '\n')

#            file_obj.write('\n')

        file_obj.close()
Code example #8
 def vacansy_parce(self, response: HtmlResponse):
     link = response.url
     name = response.xpath("//h1/text()").extract_first()
     salary = response.xpath(
         "//span[@class='_3mfro _2Wp8I ZON4b PlM3e _2JVkc']/text()"
     ).extract()
     company_name = response.xpath(
         "//h2[@class='_3mfro PlM3e _2JVkc _2VHxz _3LJqf _15msI']/text()"
     ).extract()
     company_address = response.xpath(
         "//span[@class='_3mfro _1hP6a _2JVkc']/text()").extract_first()
     yield JobparserItem(name=name,
                         salary=salary,
                         company_name=company_name,
                         company_address=company_address,
                         link=link)
Code example #9
File: book24ru.py  Project: nnnedelkina/DataParsing
 def parse(self, response: HtmlResponse):
     next_page = response.xpath("//a[text()='Далее']/@href").extract_first()
     if next_page:
         yield response.follow(next_page, callback=self.parse)
     book_links = response.css('a.book__image-link::attr(href)').extract()
     for link in book_links:
         yield response.follow(link, callback=self.book_parse)
Code example #10
File: sjru.py  Project: fdima/ScrapyAndParsing
 def parse_vacancy(response: HtmlResponse):
     item = {
         'script':
         response.xpath(
             '//script[@type="application/ld+json"]/text()').extract()
     }
     yield SuperjobItem(**item)
Code example #12
    def start(self):

        self.driver.get(
            'https://wipo.taleo.net/careersection/wp_2/jobsearch.ftl?lang=en#')
        self.driver.maximize_window()
        self.driver.implicitly_wait(30)
        time.sleep(3)

        if 'Job' in self.driver.page_source:
            response = HtmlResponse(url="my HTML string",
                                    body=self.driver.page_source,
                                    encoding="utf-8")
            links = response.xpath(
                '//div[@class="multiline-data-container"]/div/span/a/@href'
            ).extract()
            logger.info("WIPO共" + str(len(links)) + "条网页待爬")
            items = []
            for link in links:
                logger.debug("WIPO待爬岗位:  " + "https://wipo.taleo.net" + link)
                url = 'https://wipo.taleo.net' + link
                self.driver.get(url)
                time.sleep(3)
                item = self._parse(self.driver.page_source, url)
                if item not in items:
                    logger.debug("页面%s爬取成功" % url)
                    items.append(item)

            logger.debug("共爬取WIPO岗位数据%d条" % len(items))
            saveToCsv = SaveToCsv()
            saveToCsv.saveWIPOjobs(WIPOPath, items)
        else:
            self.start()
Code example #13
    def vacansy_parse(self, response: HtmlResponse):
        name_vac = response.css('h1::text').extract_first()
        salary_vac = response.xpath("//span[@class='_3mfro _2Wp8I PlM3e _2JVkc']/text()").extract()
        url_vac = response.url
        source_vac = 'superjob.ru'

        yield JobparserItem(name=name_vac, salary=salary_vac, url=url_vac, source=source_vac)
Code example #14
 def parse(self, response: HtmlResponse):
     last_page = response.xpath(
         "//div[contains(@data-marker,'pagination-button')]/span[contains(@data-marker,"
         "'page')][last()]/text()").extract_first()
     items = response.xpath(
         "//div[@itemtype='http://schema.org/Product'] //a[@data-marker='item-title']/@href"
     ).extract()
     if int(last_page) > 1:
         count_page = int(last_page)
         while count_page > 1:
             page_url = f'https://www.avito.ru/chelyabinsk/tovary_dlya_kompyutera?cd=2&p={count_page}'
             count_page -= 1
             yield response.follow(page_url, callback=self.parse)
     for item in items:
         item_link = f'https://www.avito.ru/{item}'
         yield response.follow(item_link, callback=self.item_pars)
Code example #15
 def parse_2(self, response):
     cadena_temp_1 = response.body.split("<TABLE  CELLSPACING=1>")
     cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
     cadena_temp_1[0] = ('<HTML><BODY><TABLE  CELLSPACING=1>' +
                         cadena_temp_1[0] +
                         '</TABLE></BODY></HTML>').lower()
     response = HtmlResponse(url=response.url, body=cadena_temp_1[0])
     #pprint.pprint("++++++++++++++++++++++++++++++")
     for registro in response.xpath('.//body/table/tbody/tr'):
         item = Crawler_2Item()
         if not registro.xpath('td[1]/font/a/text()').extract():
             item["date_text"] = ""
         else:
             item["date_text"] = registro.xpath(
                 'td[1]/font/a/text()').extract()[0]
         if not registro.xpath('td[1]/font/a/@href').extract():
             item["date_href"] = ""
         else:
             item["date_href"] = registro.xpath(
                 'td[1]/font/a/@href').extract()[0]
         if not registro.xpath('td[2]/font/text()').extract():
             item["city"] = ""
         else:
             item["city"] = registro.xpath('td[2]/font/text()').extract()[0]
         if not registro.xpath('td[3]/font/text()').extract():
             item["state"] = ""
         else:
             item["state"] = registro.xpath(
                 'td[3]/font/text()').extract()[0]
         if not registro.xpath('td[4]/font/text()').extract():
             item["shape"] = ""
         else:
             item["shape"] = registro.xpath(
                 'td[4]/font/text()').extract()[0]
         if not registro.xpath('td[5]/font/text()').extract():
             item["duration"] = ""
         else:
             item["duration"] = registro.xpath(
                 'td[5]/font/text()').extract()[0]
         if not registro.xpath('td[6]/font/text()').extract():
             item["summary"] = ""
         else:
             item["summary"] = registro.xpath(
                 'td[6]/font/text()').extract()[0]
         if not registro.xpath('td[7]/font/text()').extract():
             item["posted"] = ""
         else:
             item["posted"] = registro.xpath(
                 'td[7]/font/text()').extract()[0]
         #pprint.pprint(item_tabla_2)
         url_nuevo = 'http://nuforc.org/webreports/' + item["date_href"]
         item["detalle1"] = ""
         item["detalle2"] = ""
         yield scrapy.Request(
             url_nuevo,
             body="",
             method='GET',
             headers={"content-type": "application/x-www-form-urlencoded"},
             dont_filter=True,
             callback=lambda r: self.parse_3(r, item))
Code example #16
 def __parseUrls(self, page_source):
     response = HtmlResponse(url="My HTML String",
                             body=page_source,
                             encoding="utf-8")
     hotel_list = response.xpath(
         "//div[@class='h_list']/div[@class='h_item']")
     for hotel in hotel_list:
         url = hotel.xpath(".//p[@class='h_info_b1']/a/@href").extract()[0]
         name = hotel.xpath(
             ".//p[@class='h_info_b1']/a/@title").extract()[0]
         address = hotel.xpath(
             ".//p[@class='h_info_b2']/text()").extract()[1]
         commnum = hotel.xpath(
             ".//div[@class='h_info_comt']/a/span[@class='c555 block mt5']/b/text()"
         ).extract()
         if len(commnum) == 0:
             commnum = 0
         else:
             commnum = commnum[0]
         self.listPageInfo.append({
             "guid": uuid.uuid1(),
             "url": url,
             "hotel_name": name,
             "OTA": self.__ota_info,
             "comm_num": commnum,
             "address": address
         })
         pass
Code example #17
 def parse(self, response):
     #pprint.pprint("------------------------------")
     cadena_temp_1 = response.body.split("<TABLE  CELLSPACING=1>")
     cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
     cadena_temp_1[0] = '<HTML><BODY><TABLE  CELLSPACING=1>'.lower(
     ) + cadena_temp_1[0].lower() + '</TABLE></BODY></HTML>'.lower()
     response_2 = HtmlResponse(
         url="http://nuforc.org/webreports/ndxevent.html",
         body=cadena_temp_1[0])
     for registro in response_2.xpath('.//body/table/tbody/tr'):
         item_tabla = CrawlerUfoItem()
         item_tabla['report_href'] = registro.xpath(
             'td[1]/font/a/@href').extract()[0]
         item_tabla['report_text'] = registro.xpath(
             'td[1]/font/a/text()').extract()[0]
         item_tabla['count'] = registro.xpath(
             'td[2]/font/text()').extract()[0]
         #pprint.pprint(item_tabla)
         url_nuevo = 'http://nuforc.org/webreports/' + item_tabla[
             'report_href']
         yield scrapy.Request(
             url_nuevo,
             body="",
             method='GET',
             headers={"content-type": "application/x-www-form-urlencoded"},
             callback=self.parse_2,
             dont_filter=True)
Code example #18
    def advert_parse(self, response: HtmlResponse):
        title = response.xpath(
            '//span[contains(@class, "title-info-title-text")]/text()'
        ).extract_first()

        price = response.xpath(
            '//span[contains(@class, "js-item-price")]/text()').extract_first(
            )

        attrs = response.xpath(
            '//li[contains(@class, "item-params-list-item")]').extract()
        clean_attrs = []
        for attr in attrs:
            clean_attrs.append(self.remove_html_tags(attr))

        yield AvitoparseItem(title=title, price=price, attrs=clean_attrs)
Code example #19
 def __crawllianjie(self, page_sourse):
     response = HtmlResponse(url="my HTML string",
                             body=page_sourse,
                             encoding="utf-8")
     hotel_list = response.xpath("//div[@class='searchresult_list ']/ul")
     for hotel in hotel_list:
         url = hotel.xpath(
             "li[@class='searchresult_info_name']/h2/a/@href").extract()[0]
         address = hotel.xpath(
             "li[@class='searchresult_info_name']/p[@class='searchresult_htladdress']/text()"
         ).extract()[0]
         commnum = hotel.xpath(
             "li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()"
         ).extract()
         if len(commnum):
             commnum = re.sub('\D', '', commnum[0])
             commnum = commnum if len(commnum) > 0 else 0
         else:
             commnum = 0
         name = hotel.xpath(
             "li[@class='searchresult_info_name']/h2/a/text()").extract()[0]
         self.listPageInfo.append({
             "guid": uuid.uuid1(),
             "url": url,
             "hotel_name": name,
             "OTA": self.__ota_info,
             "comm_num": int(commnum),
             "address": address
         })
Code example #20
File: glassdoor.py  Project: aayushijha27/scripts
 def parse(self, response):
     f = open('glassdoor_northwest.csv', 'a')
     writer = csv.writer(f)
     text = response.xpath('//div[@class="hreview"]').extract()
     #user = []
     for items in text:
         items = HtmlResponse(url="my html string", body=items, encoding='utf-8')
         date = items.xpath('//time[@class="date subtle small"]/text()').extract()
         author = items.xpath('//span[@class = "authorJobTitle reviewer"]/text()').extract()
         location = items.xpath('//span[@class = "authorLocation"]/text()').extract()
         work_exp = items.xpath('//p[@class = " tightBot mainText"]/text()').extract()
         pros = items.xpath('//p[@class = " pros mainText truncateThis wrapToggleStr"]/text()').extract()
         cons = items.xpath('//p[@class = " cons mainText truncateThis wrapToggleStr"]/text()').extract()
         string = str(pros) + " " + str(cons)
         #user.append([date, author, location, work_exp, pros, cons])
         # Apply the same cleanup chain to every column before writing the row.
         cleaned = []
         for value in (date, author, location, work_exp, string):
             cleaned.append(str(value).replace("\'", "").replace('\\t', '')
                            .replace('\\r', "").replace('\\n', ' ').replace('\\', '')
                            .lstrip('[').rstrip(']').replace("', '", '').replace('\xa0', ''))
         writer.writerow(cleaned)
Code example #21
File: spider.py  Project: kuangklq/pc
def spider():
    options = Options()
    options.add_argument('-headless')
    driver = webdriver.Chrome(options=options)
    # The three lines above start Chrome in headless mode, which noticeably speeds up the run
    # During testing you can comment out the three lines above and launch a visible browser with the line below instead
    # driver = webdriver.Chrome()
    url = 'https://www.shiyanlou.com/courses/427'
    driver.get(url)                # open the page to be crawled
    result = []
    while True:
        driver.implicitly_wait(3)  # implicit wait of 3 seconds
        html = driver.page_source
        response = HtmlResponse(url=url, body=html.encode())
        for comment in response.css('div.comment-item'):
            d = {
                'username': comment.css('a.name::text').extract_first().strip(),
                'content': comment.css('div.content::text').extract_first(
                    ).strip()
            }
            result.append(d)
        # If the class attribute of the second li tag contains "disabled", there is no next page
        if 'disabled' in response.xpath('(//li[contains'
            '(@class, "page-item")])[2]/@class').extract_first():
            break
        # Locate the second li tag, i.e. the "next page" button
        ac = driver.find_element_by_xpath(
            '(//li[contains(@class, "page-item")])[2]')
        # chromedriver cannot interact with elements outside the visible area; the next line scrolls the button into view
        ActionChains(driver).move_to_element(ac).perform()
        time.sleep(1)  # wait for the button to render
        ac.click()     # click the next-page button
    driver.quit()
    with open('comments.json', 'w') as f:
        json.dump(result, f)
Code example #22
File: hh.py  Project: e-razdumina/Data_collection
 def vacancy_parse(self, response: HtmlResponse):
     name = response.css('div.vacancy-title h1::text').extract_first()
     salary = response.xpath(
         "//p[@class='vacancy-salary']/span/text()").extract()
     link = response.url
     # print(name,salary)
     yield JobparserItem(name=name, salary=salary, link=link)
Code example #23
 def book_parse(self, response: HtmlResponse):
     name = response.css("h1::text").extract_first()
     author = response.xpath(
         "//div[@class='item-tab__chars-item']//span[contains(text(),'Автор')]//..//span//a//text()"
     ).extract()
     main_price = response.css(
         "div.item-actions__price-old::text").extract_first()
     discount_price = response.xpath(
         "//div[@class='item-actions__price']//b//text()").extract_first()
     rating = 0  #response.xpath("//div[@id='rate']//text()").extract_first()
     yield JobparserItem(name=name,
                         href=response.url,
                         author=author,
                         main_price=main_price,
                         discount_price=discount_price,
                         rating=rating)
Code example #24
    def set_secure_headers(self, html_url):
        """设置authorization、x-guest-token"""
        home_page_content = requests.get(url=html_url,
                                         headers=self.common_headers).text
        x_guest_token = re.search('decodeURIComponent\("gt=(.*?);',
                                  home_page_content, re.S).group(1)

        # 2. Find the JS file that carries the authorization token
        home_page_response = HtmlResponse(url=html_url,
                                          body=home_page_content,
                                          encoding='utf-8')
        token_js_url = home_page_response.xpath(
            '//link[@rel="preload"][last()]/@href').extract_first()
        js_content = requests.get(url=token_js_url,
                                  headers=self.common_headers).text
        authorization_code = re.search('a="Web-12",s="(.*?)"', js_content,
                                       re.S).group(1)

        headers = {
            'Connection': 'keep-alive',
            'authorization': 'Bearer {}'.format(authorization_code),
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
            'x-guest-token': x_guest_token,
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        self.request_headers.update(headers)
Code example #25
    def vacansy_parse(self, response: HtmlResponse):
        name_vac = response.css('h1::text').extract_first()
        salary_vac = response.xpath(
            "//span[@class='bloko-header-2 bloko-header-2_lite']/text()"
        ).extract()

        yield JobparserItem(name=name_vac, salary=salary_vac)
Code example #26
 def parse(self, response: HtmlResponse):
     ads_links = response.xpath(
         '//a[@class="item-description-title-link"]/@href| '
         '//a[@class="description-title-link js-item-link"]/@href').extract(
         )
     for link in ads_links:
         yield response.follow(link, self.parse_ads)
Code example #27
File: hhru.py  Project: wangalex1/Scrapy
 def vacancy_parse(self, response: HtmlResponse):
     link = response.url
     name = response.xpath('//h1[@class=\'header\']//span/text()').extract_first()
     salary = response.css('div.vacancy-title p.vacancy-salary::text').extract()
     salary = self.format_salary(salary)
     print(link, name, salary)
     yield JobparserItem(name=name, salary=salary, link=link, site="hh.ru")
Code example #28
File: hhru.py  Project: Snusmi/GB_parcing_methods
 def vacansy_parce(self, response: HtmlResponse):
     link = response.url
     name = response.xpath("//h1/text()").extract_first()
     salary = response.xpath(
         "//p[@class='vacancy-salary']/span/text()").extract()
     company_name = response.xpath(
         "//a[@data-qa='vacancy-company-name']/span/text() | //a[@data-qa='vacancy-company-name']/span/span/text()"
     ).extract()
     company_address = response.xpath(
         "//p[@data-qa='vacancy-view-location']/text() | //p[@data-qa='vacancy-view-location']/span/text()"
     ).extract()
     yield JobparserItem(name=name,
                         salary=salary,
                         company_name=company_name,
                         company_address=company_address,
                         link=link)
Code example #29
File: people.py  Project: TomaseLiu/myzhihu
    def parse(self, response):
        print "test point"
        response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body)
        url = response.url
        #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract()
        
        #active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
        active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div')

        file_obj = open('active_now.log', 'w')

        for active_block in active_page_list:
            active = active_block.xpath('.//div[1]/text()').extract()[1].strip()
            question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract()
            answer_link_list = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()
            answer_link = ""
            if len(answer_link_list) > 0:
                answer_link = answer_link_list[0]
            
            question_txt = ""
            if len(question) > 0:
                question_txt = question[0] 
            if 'http' not in answer_link:
                answer_link = "http://www.zhihu.com" + answer_link
            file_obj.write(active.encode('utf-8') + '\t' + question_txt.encode('utf-8') + '\t' + answer_link.encode('utf-8') + '\n')

#            file_obj.write('\n')
            print answer_link
        file_obj.close()
Code example #30
File: okezone.py  Project: mnafian/rojak
    def parse_news(self, response):
        self.logger.info('parse_news: %s' % response)
        parsed_news = json.loads(str(response.body))[0]

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', parsed_news['url'])

        if not parsed_news['title']:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', parsed_news['title'])

        # Convert HTML text to a scrapy response
        html_response = HtmlResponse(url=parsed_news['url'],
                body=parsed_news['content'].encode('utf-8', 'ignore'))
        xpath_query = '''
            //body/node()
                [not(descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::image|
                    descendant-or-self::img|
                    descendant-or-self::iframe
                )]
        '''
        raw_content_selectors = html_response.xpath(xpath_query)
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        loader.add_value('raw_content', raw_content)

        if not parsed_news['published']:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Parse date information
        # Example: 12 Oct 2016 - 05:25
        date_time_str = ' '.join([_(w) for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str,
                    '%d %b %Y - %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        if not parsed_news['author']:
            loader.add_value('author_name', '')
        else:
            loader.add_value('author_name', parsed_news['author'])

        # Move scraped news to pipeline
        return loader.load_item()
Code example #31
def spider():
    body = open('test.html').read()
    response = HtmlResponse(url='http://example.com',
                            body=body.encode('utf-8'))
    results = []

    for company in response.xpath('//div[@class="media"]'):
        middle = company.xpath('.//li[@class="text-muted"]\
                                /span/text()').extract()[1:]
        result = dict(
            title = company.xpath('.//h4[@class="media-heading"]\
                                /a/text()'                                          ).extract_first(),
            site = company.xpath('.//a[@class="company-site"]\
                                /@href'                                       ).extract_first(),
            logo = company.xpath('.//div[@class="img-warp"]/a[@class= \
                    "company-logo"]/@style'                                           ).re_first\
                    ("background-image: url\('(.+)'\)"),
            desc = company.xpath('.//p[@class="company-desc"]\
                                /text()'                                        ).extract_first(),
            location = company.xpath('.//li[@class="text-muted "]\
                                /span/text()'                                             ).extract_first(),
            field = middle[0] if middle else ''
        )
        results.append(result)
    with open('../datas/companies.json', 'w') as f:
        f.write(json.dumps(results))
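The snippet above parses a page saved to disk, which is also a convenient way to unit-test selectors offline. A minimal hedged variant (the file name and selector are placeholders):

from scrapy.http import HtmlResponse

# Read any locally saved page and wrap it exactly like a live response.
with open('test.html', 'rb') as f:
    response = HtmlResponse(url='http://example.com', body=f.read())
print(len(response.xpath('//div[@class="media"]')))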
Code example #32
    def parse(self, response):
        items = json.loads(response.body.decode('utf-8'))['items']

        pub_dt = None
        for i in items:
            resp = HtmlResponse(url='', body=i['html'], encoding='utf8')

            link = resp.xpath('//a/@href').extract()[0]
            pub_dt = datetime.fromtimestamp(i['publish_date_t'])

            if pub_dt.date() >= self.until_date:
                yield scrapy.Request(url=link,
                                     callback=self.parse_document,
                                     meta={"pub_dt": pub_dt})

        # Requesting page if publication date of the last article is above "until_date"
        if pub_dt and pub_dt.date() >= self.until_date:
            # Forming the next page link
            link_url = self.link_tmpl.format(int(pub_dt.timestamp()))

            yield scrapy.Request(url=link_url,
                                 priority=100,
                                 callback=self.parse,
                                 meta={'page_depth': response.meta.get('page_depth', 1) + 1}
                                 )
Code example #33
File: jianshu.py  Project: umsung/scrapy
 def parse(self, response: HtmlResponse):
     """
     http://
     ajx : print(response.body.decode('utf-8'))  获取源代码
     """
     fans_list = response.xpath('//ul[@class="user-list"]//li')
     for fans in fans_list:
         item = {}
          # Follower's name
         item['fans_name'] = fans.xpath(
             './div[@class="info"]/a/text()').extract_first('')
          # Follower count
         item['fans_sum'] = fans.xpath(
             './div[@class="info"]/div[1]/span[1]/text()').extract_first('')
          # Profile link
         fans_href = fans.xpath(
             './div[@class="info"]/a/@href').extract_first('')
         fans_href = fans_href.split('/')[-1]
         fans_href = 'https://www.jianshu.com/users/' + fans_href + '/followers?page={}'
          count = int(
              fans.xpath('./div[@class="info"]/div[1]/span[2]/text()').re(
                  '粉丝 (.*)')[0])  # "粉丝" = "followers" in the page text
         for i in range(1, count // 9 + 1):
             yield scrapy.Request(url=fans_href.format(i),
                                  callback=self.parse)
         yield item
Code example #34
 def vacancy_parse(self, response: HtmlResponse):
     name_job = response.xpath('//h1/text()').extract_first()
     salary_job = response.xpath(
         '//span[@class="_1OuF_ ZON4b"]//text()').extract()
     location_job = response.xpath(
         '//div[@class="f-test-address _3AQrx"]//text()').extract()
     position_link = response.url
     company_job = response.xpath(
         '//span[@class="_3mfro _1hP6a _2JVkc _2VHxz"]/text() |'
         ' //h2[@class="_3mfro PlM3e _2JVkc _2VHxz _3LJqf _15msI"]/text()'
     ).extract_first()
     yield JobparserItem(name=name_job,
                         salary=salary_job,
                         location=location_job,
                         link=position_link,
                         company=company_job)
Code example #35
File: news.py  Project: M-Arthur-A/Scraping_projects
 def parse(self, response: HtmlResponse):
     if response.url == 'https://www.kommersant.ru/rubric/3':
         rubric = 'economics'
     else:
         rubric = 'finance'
     topics = response.xpath(
         '//div[@class="grid_cell grid_cell_big js-middle"]//h4[contains(@class, "uho")]/a//text()').extract()
     resumes = response.xpath(
         '//div[@class="grid_cell grid_cell_big js-middle"]//h3[contains(@class, "uho")]/a//text()').extract()
     hrefs = response.xpath(
         '//div[@class="grid_cell grid_cell_big js-middle"]//h3[contains(@class, "uho")]/a/@href').extract()
     for i, href in enumerate(hrefs):
         variable = {'rubric': rubric, 'topic': topics[i], 'resume': resumes[i], 'href': href}
         yield response.follow(href,
                               callback=self.get_item,
                               meta={'attrs': deepcopy(variable)})
Code example #36
    def parse(self, response: HtmlResponse):
        # '/html/body/div[1]/div[2]/div[2]/div[3]/div[4]/div/div[1]/div[2]/div/div[2]/div[1]/div[1]/div[1]/h3/a'
        # urls = response.xpath('//div[contains(@data-marker, "item")]/div[@class="item__line"]//h3/a[@itemprop="url"]')

        for url in response.xpath(
                '//div[contains(@data-marker, "item")]/div[@class="item__line"]//h3/a[@itemprop="url"]'):
            yield response.follow(url, callback=self.avd_parse)
Code example #37
 def parse(self, response):
     response = HtmlResponse(url=self.shops_root_url, body=response.body)
     all_links = response.xpath('*//a/@href').extract()
     link_index = 0
     for link in all_links:
       link_index = link_index + 1
       yield SplashRequest(url=self.shops_root_url, callback=self.parse_via_pages,
                           endpoint='execute',
                           args={'lua_source': script, 'link_index': link_index})
Code example #38
File: zhihu.py  Project: TomaseLiu/myzhihu
    def parse_page(self, response):
        #print "test point"
        response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body)
        url = response.url
        name = response.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        context_list = response.xpath('//div[@class="zm-editable-content"]/text()').extract()
        print name[0]
        for context in context_list:
            print context

        answer_num = response.xpath('//h3/@data-num').extract()
        if len(answer_num) == 0:
            print 1
        else:
            print answer_num[0]

        author_list = response.xpath('//*[@class="author-link"]/text()').extract()
        for author in author_list:
            print author
Code example #39
    def parse(self, response):
        response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body)
        url = response.url
        title = " ".join(response.xpath('/html/body/div[@class="container"]/div[@class="bodybox"]/div[1]/div[2]/h1/text()').extract())
        header = response.xpath('/html/body/div[@class="container"]/div[@class="bodybox"]/div[1]/div[2]/h2/text()').extract()
        if len(header) <= 1:
            return

        paragraph_list = response.xpath('/html/body/div[@class="container"]/div[@class="bodybox"]/div[1]/div[2]/div[2]/p/text()').extract()
        body = '\n'.join(paragraph_list)

        time_source = header[0]
        related_industry = ""
        related_theme = ""
        
        ri_key = "关联行业"
        rt_key = "关联概念"


        for item in header:
            print item
            if ri_key.decode('utf-8') in item:
                related_industry =  item.split(u':')[1]
                #continue
            if rt_key.decode('utf-8') in item:
                related_theme = item.split(u':')[1]

        if related_theme == "":
            return        

        file_name = './' + url.split('_')[1] + '.ycj'
        #print url, title, related_theme, body       

        news = News()
        news['title'] = title
        news['url'] = url
        news['time_source'] = time_source
        news['related_industry'] = related_industry
        news['related_theme'] = related_theme
        news['body'] = body
        return news
Code example #40
 def __parseHotelRoomInfo(self, page_source, hotel_id):
     response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
     hotel_price_list_len = len(response.xpath("//div[@class='hotel_price_body']/div"))
     hotel_price_body_dom = response.xpath("//div[@class='hotel_price_body']")
     crawl_time = datetime.datetime.now().strftime('%Y-%m-%d')
     if hotel_price_list_len < 2:
         return False
     else:
         for i in range(2, hotel_price_list_len+1):
             room_item_list_len = len(hotel_price_body_dom.xpath("div[%d]/div[@class='fleft s2']/div[@class='item']"%i))
             room_item_list = hotel_price_body_dom.xpath("div[%d]/div[@class='fleft s2']"%i)
             room_name = hotel_price_body_dom.xpath("div[%d]/div[@class='fleft s1']/div/p[@class='name']/text()"%i).extract()[0]
             if room_item_list_len > 0:
                 for j in range(1, room_item_list_len+1):
                     description = room_item_list.xpath("div[%d]/div[@class='m1 fleft']/span/text()"%(j+1)).extract()[0]
                     bed_type = room_item_list.xpath("div[%d]/div[@class='m2 fleft']/span/text()"%(j+1)).extract()[0]
                     breakfast = room_item_list.xpath("div[%d]/div[@class='m3 fleft']/span/text()"%(j+1)).extract()[0]
                     wifi = room_item_list.xpath("div[%d]/div[@class='m4 fleft']/span/a/text()"%(j+1)).extract()[0]
                     cancel_policy = room_item_list.xpath("div[%d]/div[@class='m5 fleft']/span/a/text()"%(j+1)).extract()[0]
                     price = room_item_list.xpath("div[%d]/div[@class='m6 fleft']//span[@class='digit']/text()"%(j+1)).extract()[0]
                     self.priceList.append({"guid":uuid.uuid1(),"room_name":room_name, "description":description, "bed_type":bed_type, "breakfast":breakfast, "wifi":wifi, "cancel_policy":cancel_policy, "price": int(price),"crawl_time":crawl_time, "hotel_id":hotel_id})
Code example #42
 def __parseHotelComment(self, page_source, hotel_id, comm_type):
     response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
     remarkDom = response.xpath("//div[@class='user_remark_datail']")
     remarkDomLen = len(response.xpath("//div[@class='user_remark_datail']/div"))
     # Count how many of this page's comments are already in the saved comment list
     same_num = 0
     for i in range(1, remarkDomLen+1):
         id = uuid.uuid1()
         # Username
         username = remarkDom.xpath("div[%d]/div[@class='a1']/div[@class='b2']/text()"%i).extract()
         username = username[0] if len(username) > 0 else ""
         # Comment text
         remarkText = remarkDom.xpath("div[%d]/div[@class='a2']/div[@class='b2']/p/text()"%i).extract()
         remark = ""
         for str in remarkText:
             remark = remark + re.sub("\s+", "", str)
         # Comment time
         comm_time = remarkDom.xpath("div[%d]/div[@class='a2']/div[@class='b4']/div[@style='float: right;']/text()"%i).extract()[0]
         # User type
         user_type = ""
         senti_value = None
         viewpoint = None
         try:
             user_type = remarkDom.xpath("div[%d]/div[@class='a1']/div[@class='b3']/text()"%i).extract()[0]
             senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
             viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"),decoding="utf-8"))
         except:
             traceback.print_exc()
         comm = {"guid":id, "username":username, "remark":remark, "comm_time":comm_time, "user_type":user_type, "hotel_id":hotel_id, "comm_type":comm_type, "senti_value":senti_value, "viewpoint":viewpoint}
         if self.__is_exist_in_comment_list(comm):
             same_num += 1
         else:
             self.commList.append(comm)
     if same_num == remarkDomLen:
         return False
     else:
         return True
Code example #43
File: justice.py  Project: iandees/all-the-places
    def parse(self, response):
        data = json.loads(response.body_as_unicode())
        stores = data['markers']                            
        for store in stores:                                 
            html = HtmlResponse(
                url="", 
                body=store['info'].encode('UTF-8')
            )

            unp = {}
            unp['lat'] = store['lat']
            unp['lon'] = store['lng']

            if unp['lat']: unp['lat'] = float(unp['lat'])
            if unp['lon']: unp['lon'] = float(unp['lon'])

            unp['ref'] = store['locationId']
            unp['addr_full'] = html.xpath('//div[contains(@class, "addr")]/text()').extract_first()
            unp['phone'] = html.xpath('//div[contains(@class, "phone")]/text()').extract_first()
            unp['name'] = html.xpath('//div[@class="loc-name"]/text()').extract_first()
            addr2 = html.xpath('//div[contains(@class, "csz")]/text()').extract_first()
            if addr2:
                addr2 = addr2.strip()
                three_pieces = self.addr2regex.search(addr2)
                if three_pieces:
                    city, state, zipcode = three_pieces.groups()
                    unp['city'] = city
                    unp['state'] = state
                    unp['postcode'] = zipcode
           
            properties = {}                                                
            for key in unp:
                if unp[key]:
                    properties[key] = unp[key]

            yield GeojsonPointItem(**properties)             
Code example #44
File: ufos_spider.py  Project: Amanda29/big-data
 def parse_3(self, response, item):
    cadena_temp_2 = response.body.lower().replace("<p>","")
    response = HtmlResponse(url=response.url, body=cadena_temp_2)
    #pprint.pprint(response.xpath('.//body/table/font/caption/b/text()').extract()[0])
    #pprint.pprint("******************************")
    pprint.pprint(item)
    for registro in response.xpath('.//body/table/tbody'):
        if not registro.xpath('tr[1]/td/font').extract():
            item["detalle1"] = ""
        else:
            item["detalle1"] = registro.xpath('tr[1]/td/font').extract()[0]
        if not registro.xpath('tr[2]/td/font').extract():
            item["detalle2"] = ""
        else:
            item["detalle2"] = registro.xpath('tr[2]/td/font').extract()[0]
        #pprint.pprint(item_tabla_3)
        yield item
Code example #46
File: concentra.py  Project: iandees/all-the-places
    def parse(self, response):
        data = json.loads(response.body_as_unicode())
        stores = data['Results']                            
        for store in stores:                                 
            url = 'https://www.concentra.com{}'.format(store['Url'])

            lat, lon = None, None
            if 'Geospatial' in store:
                geospatial = store['Geospatial']
                if 'Latitude' in geospatial:
                    lat = geospatial['Latitude']
                if 'Longitude' in geospatial:
                    lon = geospatial['Longitude']

            # Most of the data is stored as an html blob inside the json
            # so build a new HtmlResponse from it which we can parse.
            html = HtmlResponse(
                url=url, 
                body=store['Html'].encode('utf-8')
            )
            addr1 = html.xpath('//div[@class="field-addressline1"]/text()').extract_first()
            addr2 = html.xpath('//div[@class="field-addressline2"]/text()').extract_first()
            postcode = html.xpath('//span[@class="field-zipcode"]/text()').extract_first()
            phone = html.xpath('//div[@class="field-mainphone"]/text()').extract_first()
            state = html.xpath('//span[@class="field-stateabbreviation"]/text()').extract_first()
            city = html.xpath('//div[@class="field-centername"]/text()').extract_first()
            name = html.xpath('//div[@class="location-clinic-link"]/a/@title').extract_first()

            if addr1: addr1 = addr1.strip()
            if addr2: addr2 = addr2.strip()

            addr_full = None
            if addr1 and addr2:
                addr_full = ' '.join([addr1, addr2])
            elif addr1:
                addr_full = addr1
            
            properties = {}                                                
            properties['ref'] = store['Id']
            properties['website'] = url

            if addr_full: properties['addr_full'] = addr_full
            if name: properties['name'] = name
            if city: properties['city'] = city
            if state: properties['state'] = state
            if postcode: properties['postcode'] = postcode
            if phone: properties['phone'] = phone.replace('.', '-')
            if lat: properties['lat'] = lat
            if lon: properties['lon'] = lon
                                                             
            yield GeojsonPointItem(**properties)             
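
The move used above, wrapping an HTML fragment that arrives inside a JSON payload in an HtmlResponse so it can be queried with XPath, works for any string of markup. A minimal standalone sketch (the snippet and CSS class are invented for illustration):

from scrapy.http import HtmlResponse

snippet = '<div class="field-addressline1"> 123 Main St </div>'  # hypothetical blob
resp = HtmlResponse(url='http://example.com', body=snippet, encoding='utf-8')
addr = resp.xpath('//div[@class="field-addressline1"]/text()').extract_first()
print(addr.strip() if addr else None)
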
コード例 #47
0
 def __parseUrls(self,page_source):
     response = HtmlResponse(url="My HTML String",body=page_source,encoding="utf-8")
     hotel_list = response.xpath("//div[@class='h_list']/div[@class='h_item']")
     for hotel in hotel_list:
         url = hotel.xpath(".//p[@class='h_info_b1']/a/@href").extract()[0]
         name = hotel.xpath(".//p[@class='h_info_b1']/a/@title").extract()[0]
         address = hotel.xpath(".//p[@class='h_info_b2']/text()").extract()[1]
         commnum = hotel.xpath(".//div[@class='h_info_comt']/a/span[@class='c555 block mt5']/b/text()").extract()
         if len(commnum) == 0:
             commnum = 0
         else:
             commnum = commnum[0]
         self.listPageInfo.append({
             "guid": uuid.uuid1(),
             "url": url,
             "hotel_name": name,
             "OTA": self.__ota_info,
             "comm_num": commnum,
             "address": address
         })
         pass
コード例 #48
0
    def crawllianjie(self,page_sourse):
        response = HtmlResponse(url="my HTML string",body=page_sourse,encoding="utf-8")

        A = response.xpath("//div[@class='searchresult_list ']/ul")
        # Link of each hotel
        for B in A:
            url = B.xpath("li[@class='searchresult_info_name']/h2/a/@href").extract()
            # Review count
            commnum = B.xpath("li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()").extract()
            Discuss = 0
            if len(commnum):
                digits = re.sub('\D', '', commnum[0])
                if len(digits):
                    Discuss = digits
            self.listPageInfo.append({"url": url[0], "comm_num": Discuss, "city": "南京"})
        xiechengService.saveListPageInfo()
        if len(self.listPageInfo) != 25:
            print len(self.listPageInfo)
        self.listPageInfo = []
コード例 #49
0
                print e.code
                print e.read()
                continue
        except urllib2.URLError, e:
                # URLError has no .code/.read; report the reason instead
                print e.reason
                continue
        except httplib.HTTPException, e:
                print e
                continue
        post_soup = BeautifulSoup(post_page, "lxml")
        post_response = HtmlResponse(url=page_url.url, body=str(post_soup))
        path = old_path
        version = 0 # 0: old 1:new
        reply_num = post_response.xpath(path['reply_num']).extract()
        if reply_num == []:
            version = 1
            #print "new verion!"
            path = new_path
            reply_num = post_response.xpath(path['reply_num']).extract()
            if reply_num == []:
                reply_num = 0
            else:
                reply_num = int(reply_num[0].strip().split(' ')[0])
        else:
            reply_num = int(reply_num[0].strip().split(' ')[0])
            #print "old version"

        if reply_num == 0:
            continue
コード例 #50
0
    def crawlHotelInfo(self,target):
        # target comes from the baseinfo table
        url = target[1]
        self.openPage("http://hotel.elong.com"+url)

        self.wait(3)
        time.sleep(random.uniform(1,3))
        # If the URL is no longer valid, return

        if self.isAlertPresent():
            return False

        response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
        # Parse the hotel page info
        if self.if_crawl_hotel_info is True:
            self.__parseHotelInfo(response, target)
            pass

        # Parse the hotel room info
        if self.if_crawl_hotel_price is True:
            record_time = 0
            while 1:
                try:
                    self.priceList = []
                    self.__parseHotelRoomInfo(self.driver.page_source, target[0])
                    break
                except:
                    time.sleep(2)
                    record_time += 1
                if record_time > 3:
                    break
        # Crawl the hotel reviews
        if self.if_crawl_hotel_comment is True:
            self.commList = []
            # TODO only when the review count is greater than 0 (redundant, could be reworked)
            if target[4]>0:
                self.driver.find_element_by_xpath("//body").send_keys(Keys.END)
                self.wait(2)
                read_time = 0
                comm_page_num = ''
                while 1:
                    try:
                        response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
                        hotelname = ""
                        page_a = response.xpath("//div[@id='comment_paging']/a")

                        if len(page_a) == 1:
                            comm_page_num = page_a.xpath(".//text()").extract()[0]
                        if len(page_a) > 1:
                            comm_page_num = page_a.xpath(".//text()").extract()[-2]
                    except Exception, e:
                        print e
                    time.sleep(1)
                    read_time+=1
                    if read_time > 10:  # give up after 10 waits
                        break
                    if comm_page_num != '':
                        break

                print "评论共有:",comm_page_num,"页"
                if self.__crawlHotelComment(self.driver, target[0], comm_page_num):
                    print ""
                    print "共抓取了",len(self.commList),"条评论,存储到commList中"
コード例 #51
0
ファイル: t.py プロジェクト: openslack/openslack-web
# coding=utf-8
import re
import json

import requests
from scrapy.http import HtmlResponse

response = HtmlResponse(
    url='http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=oIWsFt_Id9NTbaO6ms2zvSBm2RzI&eqs=qBsQoCeguK%2B0ofdI%2B6h3FuvrCqfh1RlwTme4vOefG9aBeZd%2BPz%2FN4dn91sq5UJD2r2xev&ekv=3&page=1')
# response.selector.xpath('//span/text()').extract()
# response.xpath('//title/text()')
# Selector(response=response).xpath('//span/text()').extract()
content = requests.get(response.url).content
# doc=u"""
# <span id="J_realContact" data-real="电话021-60131333 传真021-60131356 &nbsp;&nbsp; <a target='_blank' href='http://my.ctrip.com/uxp/Community/CommunityAdvice.aspx?producttype=3&categoryid=65'>纠错</a>" style="color:#0066cc;cursor:pointer;">联系方式</span> 
# """
# print content
# regex = re.compile(r"sogou\.weixin\.gzhcb\((.*\])\}\)")
# print regex.findall(content)

content = re.search(r'\{.*\]\}', content).group()
docs = ""
for i in json.loads(content)["items"]:
    docs += i
se = HtmlResponse(url="http://www.qq.com", body=docs, encoding="utf8")
print se.xpath("//item//docid/text()").extract()
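
The re.search(r'\{.*\]\}', content) line above leans on the payload ending in ]}; a slightly more general sketch strips the JSONP wrapper by callback name (here the gzhcb callback named in the URL) before handing the rest to json:

import json
import re

def strip_jsonp(text, callback="sogou.weixin.gzhcb"):
    # Pull the JSON object out of a JSONP reply of the form callback({...})
    match = re.search(re.escape(callback) + r'\((.*)\)\s*$', text, re.S)
    return json.loads(match.group(1)) if match else None

# data = strip_jsonp(requests.get(response.url).text)
# data["items"] is then the same list of XML fragments parsed above
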
コード例 #52
0
 def __parseHotelRoomInfo(self, page_source, hotel_id):
     response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
     rooms_list = response.xpath("//div[@class='htype_list']/div")
     rooms_list_len = len(rooms_list)
     if rooms_list_len <= 0:
         return False
     crawl_time = datetime.datetime.now().strftime('%Y-%m-%d')
     for rooms in rooms_list:
         # Room name
         roomname = rooms.xpath(".//p[@class='htype_info_name']/span/text()").extract()[0]
         # Room area
         roomarea = rooms.xpath(".//p[@class='htype_info_ty']/span[1]/text()").extract()
         if len(roomarea)!=0:
             roomarea = roomarea[0]
         else:
             roomarea = ''
         # Bed type
         bedtype = rooms.xpath(".//p[@class='htype_info_ty']/span[3]/text()").extract()
         if len(bedtype)!=0:
             bedtype = bedtype[0]
         else:
             bedtype = ''
         # Guest capacity
         havenum = rooms.xpath(".//p[@class='htype_info_ty']/span[3]/span/text()")
         if havenum:
             peoplecount = str(havenum.extract()[0])
         else:
             peoplecount = str(len(rooms.xpath(".//p[@class='htype_info_ty']/span[5]/i")))
         if peoplecount == '0':
             peoplecount = '未说明'
         # Floor
         roomsfloor = rooms.xpath(".//p[@class='htype_info_ty']/span[7]/text()").extract()
         if len(roomsfloor)!=0:
             roomsfloor = roomsfloor[0]
         else:
             roomsfloor = ''
         havewifi = rooms.xpath(".//p[@class='htype_info_ty']/span[9]/text()").extract()
         if len(havewifi)!=0:
             havewifi = havewifi[0]
         else:
             havewifi = ''
         list = rooms.xpath(".//table[@class='htype-table']/tbody/tr[@data-handle='rp']")
         descriptions = rooms.xpath(".//td[@class='ht_other']//p/text()").extract()
         description = ''
         for d in descriptions:
             dstrip = d.strip()
             if u'查看更多产品报价' != dstrip:
                 description += dstrip
         for room in list:
             roomtype = room.xpath(".//td[@class='ht_name']/span/text()").extract()[0]
             supply = room.xpath(".//td[@class='ht_supply']/text()").extract()[0].strip()
             breakfast =  room.xpath(".//td[@class='ht_brak']/text()").extract()[0]
             rule = room.xpath(".//td[@class='ht_rule']/span/text()").extract()[0]
             price = room.xpath(".//td[@class='ht_pri']/span[@class='ht_pri_h cur']/span/text()").extract()[0]
             self.priceList.append({
                 'guid':uuid.uuid1(),
                 'room_name':roomname,
                 'room_area':roomarea,
                 'bed_type':bedtype,
                 'people_count':peoplecount,
                 'rooms_floor':roomsfloor,
                 'wifi':havewifi,
                 'description':description,
                 'room_type':roomtype,
                 'supply':supply,
                 'breakfast':breakfast,
                 'cancel_policy':rule,
                 'price':price,
                 'crawl_time':crawl_time,
                 'hotel_id':hotel_id
             })
             pass
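
The extract()-then-len() checks repeated above for room area, bed type, floor and wifi can be collapsed with the selector's extract_first default (available since Scrapy 1.0); a small sketch against the same XPaths:

         # Fall back to '' when the node is missing, equivalent to the length checks above
         roomarea = rooms.xpath(".//p[@class='htype_info_ty']/span[1]/text()").extract_first(default='')
         bedtype = rooms.xpath(".//p[@class='htype_info_ty']/span[3]/text()").extract_first(default='')
         roomsfloor = rooms.xpath(".//p[@class='htype_info_ty']/span[7]/text()").extract_first(default='')
         havewifi = rooms.xpath(".//p[@class='htype_info_ty']/span[9]/text()").extract_first(default='')
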
コード例 #53
0
    def getcommentinfo(self,page_sourse):
        response = HtmlResponse(url="my HTML string",body=self.driver.page_source,encoding="utf-8")
        commentData = response.xpath("//div[@class='comment_detail_list']/div[@class='comment_block J_asyncCmt']")
        title = response.xpath("//div[@class='main_detail_wrapper ']/div[@class='detail_main detail_main_no_tips']/div[@class='htl_info']/div[@class='name']/h2[@class='cn_n']/text()").extract()
        if len(title):
            Title = title
        else:
            Title = response.xpath("//div[@class='main_detail_wrapper ']/div[@class='detail_main detail_main_no_comment']/div[@class='htl_info']/div[@class='name']/h2[@class='cn_n']/text()").extract()
        for itemData in commentData:
            itemDict = dict()

            # Hotel name
            hotelname = Title
            if len(hotelname):
                hotelnames = hotelname[0]
            else:
                hotelnames = " "


            # Username
            username = itemData.xpath("div[1]/p[2]/span/text()").extract()
            if len(username):
                usernames = username[0]
            else:
                usernames = ""

            # Review score
            commentscore = itemData.xpath("div[2]/p/span[2]/span/text()").extract()
            if len(commentscore):
                commentscores = commentscore[0]
            else:
                commentscores = ""

            # Check-in date
            intime = itemData.xpath("div[2]/p/span[3]/text()").extract()
            if len(intime):
                intimes = intime[0]
            else:
                intimes = ""

            # Trip type
            tourstyle = itemData.xpath("div[2]/p/span[4]/text()").extract()
            if len(tourstyle):
                tourstyles = re.sub('\w','',tourstyle[0])
            else:
                tourstyles = ""

            # Number of likes
            praisenum = itemData.xpath("div[2]/div[@class='comment_txt']/div[@class='comment_bar']/a/span/text()").extract()
            if len(praisenum):
                Praisenum = re.sub('\D','',praisenum[0])
                praisenums = Praisenum
            else:
                praisenums = ""

            # Review post time
            commenttime = itemData.xpath("div[2]/div[@class='comment_txt']/div[@class='comment_bar']/p/span/text()").extract()
            if len(commenttime):
                commenttimes = commenttime[0].split(u"于")[1]
            else:
                commenttimes = ""

            # Review content
            comment = itemData.xpath("div[2]/div[@class='comment_txt']/div[1]/text()").extract()
            if len(comment):
                comments = comment[0]
            else:
                comments = ""
            self.commList.append({"title":hotelnames,"username":usernames,"commentscore":commentscores, "intime":intimes, "tourstyle":tourstyles, "praisenum":praisenums,"commenttime":commenttimes,"comment":comments})