Code example #1
    def parse_item(self, response):
        article_p = response.xpath('//div[@class="content"]//p')
        title_index = []
        for i, p in enumerate(article_p):
            if p.xpath("./strong").extract():
                title_index.append(i)

        for s_index, i_item in enumerate(title_index):
            category = response.xpath("//h1/text()").extract()[0]
            title = article_p[i_item].xpath("./strong/text()").extract()[0]
            try:
                # Slice from this heading up to the next; on the last heading
                # there is no next index, so IndexError falls through below.
                next_title = title_index[s_index + 1]
                content_str = [
                    "".join(p.xpath("./text()").extract())
                    for p in article_p[i_item:next_title]
                ]
            except IndexError:
                content_str = [
                    "".join(p.xpath("./text()").extract())
                    for p in article_p[i_item:]
                ]

            item = TangspiderframeItem()
            item['url'] = response.url
            item['category'] = category
            item['content'] = "".join([part.strip() for part in content_str])
            item['title'] = title
            yield item
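
Every example on this page fills a TangspiderframeItem, whose definition is not shown. Judging from the fields the examples assign (url, category, title, content, and fingerprint in example #26), a minimal sketch of the item class might look like this; the real definition may differ:

import scrapy

class TangspiderframeItem(scrapy.Item):
    # Fields inferred from usage across the examples below.
    url = scrapy.Field()
    category = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    fingerprint = scrapy.Field()
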
Code example #2
 def parse(self, response):
     category = response.xpath('//div[@id="place"]/a[2]/text()').extract()[0]
     title = response.xpath('//div[@id="info"]/dl/h1/a/text()').extract()[0]
     content = response.xpath('//dl[@id="zi"]//p//text()').extract()
     item = TangspiderframeItem()
     item['url'] = response.url
     item['category'] = category
     item['content'] = "".join([part.strip() for part in content])
     item['title'] = title
     yield item
Code example #3
 def parse(self, response):
     category = response.xpath(
         '//div[@class="position"]/a[2]/text()').extract()[0]
     title = response.xpath(
         '//div[@class="article"]/h1/text()').extract()[0]
     content = response.xpath('//div[@class="article"]/p//text()').extract()
     item = TangspiderframeItem()
     item['url'] = response.url
     item['category'] = category
     item['content'] = "".join([part.strip() for part in content])
     item['title'] = title
     yield item
Code example #4
    def parse(self, response):
        title = response.xpath('//h1//text()').extract()
        content = response.xpath('//p/text()').extract()
        last1_link = response.xpath('//p[last()]//@href').extract()
        last2_link = response.xpath('//p[last()-1]//@href').extract()
        content = ''.join(content)
        content = content.replace("\n", "  ")
        content = content.replace("\t", "  ")
        if last1_link and last2_link:
            last1_content = response.xpath('//p[last()]/text()').extract()
            last2_content = response.xpath('//p[last()-1]/text()').extract()
            last1_content = ''.join(last1_content)
            last2_content = ''.join(last2_content)
            content = content.replace(last1_content, "")
            content = content.replace(last2_content, "")
        elif last1_link and not last2_link:
            last1_content = response.xpath('//p[last()]/text()').extract()
            last1_content = ''.join(last1_content)
            content = content.replace(last1_content, "")
        elif last2_link and not last1_link:
            last2_content = response.xpath('//p[last()-1]/text()').extract()
            last2_content = ''.join(last2_content)
            content = content.replace(last2_content, "")

        item = TangspiderframeItem()
        item['url'] = response.url
        item['category'] = response.url.split('/')[3]
        item['title'] = ''.join(title)
        item['content'] = content
        # print(item)
        yield item
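
The if/elif chain above strips the text of the last one or two paragraphs when they carry links, which on these pages are presumably related-article footers. A more aggressive variant (a sketch, not the author's method) skips link-bearing paragraphs everywhere with a single XPath predicate:

    def parse(self, response):
        # Hypothetical alternative: keep only paragraphs with no <a> descendants,
        # dropping footer/related-link paragraphs wherever they appear.
        content = response.xpath('//p[not(.//a)]/text()').extract()
        content = ''.join(content).replace("\n", "  ").replace("\t", "  ")
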
Code example #5
 def parse(self, response):
     hrefs = re.findall('<td ><a href=(.*?)>', response.text)
     for href in hrefs:
         url = "http://kids.hankooki.com/community/" + href
         item = TangspiderframeItem()
         item['url'] = url
         yield item
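
Note that the capture group in the regex above keeps whatever sits between href= and >, typically including the quote characters, so the concatenated URL may contain stray quotes. A selector-based sketch using Scrapy's response.urljoin avoids both problems (the //td/a XPath is a guess at the same table structure):

 def parse(self, response):
     # Hypothetical equivalent: select hrefs with XPath instead of a regex,
     # and let urljoin resolve relative paths against the page URL.
     for href in response.xpath('//td/a/@href').extract():
         item = TangspiderframeItem()
         item['url'] = response.urljoin(href)
         yield item
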
Code example #6
 def parse(self, response):
     links = response.xpath('//article/a/@href').extract()
     for link in links:
         item = TangspiderframeItem()
         item['url'] = link
         # print(item)
         yield item
Code example #7
 def parse(self, response):
     links = response.xpath('//div[@class="box"]//ul/li//a//@href').extract()
     for link in links:
         item = TangspiderframeItem()
         item['url'] = "http://www.enet.gr/"+link
         # print(item)
         yield item
Code example #8
    def parse(self, response):
        links = response.xpath(
            '//div[@class="lx-stream__feed"]//a/@href').extract()
        links.extend(
            response.xpath('//div[@class="mpu-available"]//a/@href').extract())
        links.extend(
            response.xpath('//div[@role="region"]//a/@href').extract())

        urls = [
            "https://www.bbc.com" + link for link in links
            if not link.startswith("http")
        ]
        urls = list(set(urls))
        for url in urls:
            item = TangspiderframeItem()
            item["url"] = url
            yield item

        type_id = self.type_id
        t = self.t
        base_url = "https://push.api.bbci.co.uk/p?"
        one_arg = "morph://data/bbc-morph-lx-commentary-latest-data/assetUri/news%2Flive%2F{}-{}/limit/31/version/4.1.27".format(
            t, type_id)
        one_url = "t={}&c=1".format(quote(one_arg, safe=''))
        first_url = base_url + one_url

        two_arg = "morph://data/bbc-morph-lx-commentary-data/assetUri/news%2Flive%2F{}-{}/limit/31/version/5.0.24/withPayload/11".format(
            t, type_id)
        two_url = "t={}&c=1".format(quote(two_arg, safe=''))
        second_url = base_url + two_url

        three_arg_1 = "morph://data/bbc-morph-feature-toggle-manager/assetUri/news%2Flive%2F{}-{}/featureToggle/dot-com-ads-enabled/project/bbc-live/version/1.0.3".format(
            t, type_id)
        three_arg_2 = "morph://data/bbc-morph-feature-toggle-manager/assetUri/news%2Flive%2F{}-{}/featureToggle/lx-old-stream-map-rerender/project/bbc-live/version/1.0.3".format(
            t, type_id)
        three_arg_3 = "morph://data/bbc-morph-feature-toggle-manager/assetUri/news%2Flive%2F{}-{}/featureToggle/reactions-stream-v4/project/bbc-live/version/1.0.3".format(
            t, type_id)
        three_arg_4 = "morph://data/bbc-morph-lx-commentary-latest-data/assetUri/news%2Flive%2F{}-{}/limit/21/version/4.1.27".format(
            t, type_id)
        three_arg_5 = "morph://data/bbc-morph-lx-commentary-latest-data/assetUri/news%2Flive%2F{}-{}/limit/31/version/4.1.27".format(
            t, type_id)
        arg = []
        for morph_arg in [
                three_arg_1, three_arg_2, three_arg_3, three_arg_4, three_arg_5
        ]:
            arg.append(quote(morph_arg, safe=''))

        three_url = "t={}&c=1&t={}&c=1&t={}&c=1&t={}&c=1&t={}&c=1".format(*arg)
        three_url = base_url + three_url

        yield scrapy.Request(url=first_url,
                             callback=self.parse_item,
                             dont_filter=True)
        yield scrapy.Request(url=second_url,
                             callback=self.parse_item,
                             dont_filter=True)
        yield scrapy.Request(url=three_url,
                             callback=self.parse_item,
                             dont_filter=True,
                             meta={"page": 31})
Code example #9
 def parse_item(self, response):
     links = response.xpath('//li[@class="hh"]/a/@href').extract()
     for link in links:
         url = "https:" + link
         item = TangspiderframeItem()
         item['url'] = url
         yield item
Code example #10
    def parse(self, response):
        # Structured data: the page embeds JSON as "window.$$data" in a <script> tag.
        content = response.xpath("//script/text()").extract()
        data_s = content[1].replace("window.$$data=", "")
        data = json.loads(data_s)

        # Extract the consultation dialog
        questionDialog = data.get("questionDialog")

        def parse_dialog(dialog_data):
            # Parse each dialog turn
            dialogs = []
            for dialog in dialog_data:
                dialog_type = dialog.get("type")
                dialog_content = dialog.get("content")
                dialog_content = re.sub("<.*?>|\n|\t|\r", "", dialog_content)
                if not dialog_content:
                    # Fall back to the transcript of an audio message, if any.
                    dialog_content = dialog.get(
                        "dialog_audio", {}).get("audio_text") or ""
                if dialog_type:
                    new_content = "doctor:" + dialog_content
                else:
                    new_content = "patient:" + dialog_content
                dialogs.append(new_content)
            return "\t".join(dialogs)

        dialog = parse_dialog(questionDialog)
        dialog = self.delete_emoji(dialog)

        disease = data.get("disease", {}).get("title")
        item = TangspiderframeItem()
        item['url'] = response.url
        item['category'] = disease
        item['content'] = dialog
        yield item
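
self.delete_emoji is not part of this excerpt. A plausible sketch, assuming it strips emoji by Unicode code-point range with a regex:

import re

def delete_emoji(self, text):
    # Hypothetical helper: drop common emoji/symbol ranges from the dialog text.
    emoji_pattern = re.compile(
        "[\U0001F300-\U0001FAFF\U00002600-\U000027BF\U0001F1E6-\U0001F1FF]")
    return emoji_pattern.sub("", text)
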
Code example #11
 def parse(self, response):
     link_arg = response.xpath('//a[@class=" ga-link"]/@href').extract()
     for link in link_arg:
         url = "https://www.ted.com" + link
         head_url, language = url.split("?", 1)  # split on the first "?" only
         item = TangspiderframeItem()
         item['url'] = head_url + "/transcript" + "?" + language
         yield item
Code example #12
 def parse(self, response):
     links = response.xpath(
         '//a[@class="iframe cboxElement"]/@href').extract()
     for link in links:
         item = TangspiderframeItem()
         item['url'] = link
         # print(item)
         yield item
Code example #13
    def parse(self, response):
        products = response.xpath('//div[@class="tit-list"]/div')

        for product in products:
            product_url = ''.join(product.xpath('.//a/@href').extract())
            item = TangspiderframeItem()
            item["url"] = product_url
            yield item
Code example #14
 def parse(self, response):
     resp = json.loads(response.text)
     data = resp.get("data", [])
     links = [entry.get("metadata", {}).get("url") for entry in data]
     for link in links:
         item = TangspiderframeItem()
         item['url'] = link
         yield item
Code example #15
    def parse(self, response):
        links = response.xpath('//h4/a//@href').extract()
        for path in links:
            link = "https://www.vientianemai.net" + path

            item = TangspiderframeItem()
            item['url'] = link
            # print(item)
            yield item
Code example #16
 def parse(self, response):
     title = response.xpath("//h1/text()").extract()
     contents = response.xpath('//div[@id="Content"]/p//text()').extract()
     item = TangspiderframeItem()
     item['category'] = "opinion"
     item['url'] = response.url
     item['title'] = title[0].strip() if title else ""
     item['content'] = " ".join(contents)
     yield item
Code example #17
 def parse(self, response):
     # print(response.text)
     data = json.loads(response.text)
     items = data.get("data", {}).get("items", [])
     for entry in items:
         q_id = entry.get("question", {}).get("id")
         url = "https://ask.dxy.com/question/{}".format(q_id)
         item = TangspiderframeItem()
         item['url'] = url
         yield item
Code example #18
    def parse(self, response):
        # Strip the "…的成语" ("idioms of …") / "…的组词" ("compounds of …") suffix.
        word = response.xpath("//h2/text()").extract()[0].replace(
            "的成语", "").replace("的组词", "")
        url = response.url
        if "com/zi" in url:
            item_li = response.xpath('//div[@class="mcon f14"]/ul/li')
            for li in item_li:
                phrase = li.xpath("./a[1]/text()").extract()
                pronunciation = li.xpath("./a[2]/text()").extract()
                item = TangspiderframeItem()
                item['url'] = response.url
                item['category'] = word
                item['title'] = pronunciation[0].strip()
                item['content'] = phrase[0]
                yield item
            next_page = response.xpath(
                '//a[contains(text(), "下一页")]/@href').extract()  # "next page"
            if next_page:
                next_url = "http://zuci.miaochaxun.com/" + next_page[0]
                yield scrapy.Request(url=next_url,
                                     meta={"word": response.meta.get("word")},
                                     callback=self.parse,
                                     dont_filter=True)

        if "com/zuci" in url:
            item_li = response.xpath('//div[@class="mcon bt"]/ul/li')
            for li in item_li:
                phrase = li.xpath("./a[1]/text()").extract()
                item = TangspiderframeItem()
                item['url'] = response.url
                item['category'] = word
                item['content'] = phrase[0]
                yield item

            next_page = response.xpath(
                '//a[contains(text(), "下一页")]/@href').extract()  # "next page"
            if next_page:
                next_url = "http://chengyu.miaochaxun.com/" + next_page[0]
                yield scrapy.Request(url=next_url,
                                     meta={"word": response.meta.get("word")},
                                     callback=self.parse,
                                     dont_filter=True)
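
Both branches hard-code a different host when following the pagination link. If the hrefs are relative to the current page, response.urljoin would make the prefixes unnecessary; a sketch of that variant (same meta and callback):

            if next_page:
                # Hypothetical: resolve the relative href against response.url.
                yield scrapy.Request(url=response.urljoin(next_page[0]),
                                     meta={"word": response.meta.get("word")},
                                     callback=self.parse,
                                     dont_filter=True)
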
Code example #19
    def parse(self, response):
        res = response.xpath('//*[@id="pronWR"]/text()').extract()
        item = TangspiderframeItem()
        item['url'] = response.url
        item['title'] = response.meta.get("word")
        if res:
            item['content'] = res[0]
        else:
            item["content"] = ""

        return item
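
This callback reads response.meta.get("word"), so the originating request must carry it. A minimal sketch of how such requests might be generated (the URL template and word list are hypothetical):

    def start_requests(self):
        # Hypothetical driver: carry each looked-up word along in meta so that
        # parse() can attach it to the item as the title.
        for word in ["example", "words"]:
            url = "https://www.example.com/dict/{}".format(word)
            yield scrapy.Request(url=url, callback=self.parse, meta={"word": word})
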
Code example #20
    def parse(self, response):
        links = response.xpath(
            '//*[@id="listNewsLG"]/div[1]/div[3]/div[contains(@class, "col-xs-")]/a/@href'
        ).extract()
        for path in links:
            link = "http://www.kongthap.gov.la/index1.php" + path

            item = TangspiderframeItem()
            item['url'] = link
            # print(item)
            yield item
Code example #21
    def parse_url(self, response):
        # next_links = response.xpath('//a[(contains(@class, "btn-bs-pagination"))]/@href').extract()
        # for next_link in next_links:
        #     yield scrapy.Request(url=next_link, callback=self.parse_url, dont_filter=True)

        links = response.xpath(
            '//h2/a[@class="post-title post-url"]/@href').extract()
        for link in links:
            item = TangspiderframeItem()
            item['url'] = link
            yield item
Code example #22
 def parse(self, response):
     content = re.findall(
         '<td colspan="12" style="padding:15px">(.*?)</td>', response.text,
         re.S)
     new_content = "<td>" + "".join(content) + "</td>"
     html = etree.HTML(new_content)
     res = html.xpath(".//text()")
     item = TangspiderframeItem()
     item['url'] = response.url
     item['content'] = "".join(res)
     return item
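
This example regex-slices a table cell out of the raw HTML, wraps it, and re-parses the fragment with lxml to strip the markup. A self-contained sketch of the same technique, with the module-level imports the excerpt assumes (function name hypothetical):

import re
from lxml import etree

def extract_cell_text(html):
    # Non-greedy DOTALL regex grabs the cell bodies; lxml then collects
    # every descendant text node, discarding the tags.
    cells = re.findall(r'<td colspan="12" style="padding:15px">(.*?)</td>', html, re.S)
    fragment = etree.HTML("<td>" + "".join(cells) + "</td>")
    return "".join(fragment.xpath(".//text()"))
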
Code example #23
 def parse(self, response):
     title = response.xpath(
         '//h1[@class="headline__title"]/text()').extract()
     content = response.xpath('//p/text()').extract()
     category = response.url.split("/")[3]
     item = TangspiderframeItem()
     item['url'] = response.url
     item['title'] = " ".join(title)
     item['content'] = " ".join(content)
     item['category'] = category
     yield item
Code example #24
    def parse_item(self, response):
        next_links = response.xpath(
            '//div[@id="div_currpage"]/a[@class="pagestyle"]/@href').extract()
        for next_link in next_links:
            next_link = "http:" + next_link
            yield scrapy.Request(url=next_link,
                                 callback=self.parse_item,
                                 dont_filter=True)

        links = response.xpath('//h4/a/@href').extract()
        for link in links:
            url = "http:" + link
            item = TangspiderframeItem()
            item["url"] = url
            yield item
Code example #25
 def parse(self, response):
     resp = json.loads(response.text)
     data = resp.get("data")
     url_lists = data["response"]["videos"]
     for url_list in url_lists:
         url = url_list["play_url"]
         self.youtube_url = url
         self.download(self.youtube_url)  # download() is a spider helper not shown here
         item = TangspiderframeItem()
         item['url'] = url
         # print(item)
         yield item
Code example #26
 def parse(self, response):
     lis = response.xpath('//div[@class="se_li"]')
     for li in lis:
         sen_en = li.xpath(".//div[@class='sen_en']//text()").extract()
         sentence = "".join(sen_en)
         md = md5(sentence)  # md5: fingerprint helper, assumed to return a hex digest
         item = TangspiderframeItem()
         item['content'] = sentence
         item['title'] = response.meta.get("keyword")
         item['category'] = ''
         item['fingerprint'] = md
         yield item
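
The md5 helper used for the fingerprint is not shown. A plausible sketch, assuming it hashes the sentence's UTF-8 bytes and returns the hex digest:

from hashlib import md5 as _md5

def md5(text):
    # Hypothetical stand-in for the project's fingerprint helper.
    return _md5(text.encode("utf-8")).hexdigest()
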
Code example #27
    def parse(self, response):
        category = response.xpath(
            '//div[@class="dingtou"]/div[@class="da-bre"]/a[2]/text()').extract()
        title = response.xpath("//h1/text()").extract()  # article title
        contents, para = [], []
        contents.extend(response.xpath('//div[contains(@class, "article")]//p'))
        contents.extend(response.xpath('//div[contains(@class, "zhengwen")]//p'))
        contents.extend(response.xpath('//div[contains(@class, "c_body")]//p'))
        contents.extend(response.xpath('//div[contains(@class, "cnt_bd")]//p'))
        for node in contents:
            para.extend(node.xpath("./text()").extract())
            sub_tag = node.xpath(".//*")
            for sub in sub_tag:
                if sub.root.tag != "script":
                    para.extend(sub.xpath(".//text()").extract())

        item = TangspiderframeItem()
        item['url'] = response.url
        item['category'] = "".join(category).replace(" > ", '').strip()
        item['content'] = "".join([part.strip() for part in para])
        item['title'] = "".join(title).strip()
        yield item
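
The nested loop above walks every descendant element just to skip text inside script tags. The same filter fits in one XPath, which also avoids extracting the text of deeply nested elements twice (a sketch; output should match for these pages):

        for node in contents:
            # Hypothetical one-step equivalent: all descendant text nodes
            # except those inside a <script> element.
            para.extend(node.xpath('.//text()[not(ancestor::script)]').extract())
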
Code example #28
    def parse(self, response):
        pinyin = response.xpath(
            "//td[@class='resultswrap']//tbody/tr/td[@class='details'][4]//text()"
        ).extract()
        item = TangspiderframeItem()
        item['url'] = response.url
        item['title'] = response.meta.get("word")
        if pinyin:
            item['content'] = "|".join(pinyin)
        else:
            item["content"] = ""

        return item
Code example #29
 def parse(self, response):
     zi_a = response.xpath('//p[@class="zi"]/a')
     for zi in zi_a:
         base_url = zi.xpath("./@href").extract()[0]
         word = zi.xpath("./text()").extract()[0]
         pron_span = zi.xpath("./span/text()").extract()
         item = TangspiderframeItem()
         item['url'] = response.url
         item['category'] = base_url
         if pron_span:
             item['title'] = pron_span[0]
         item['content'] = word
         yield item
Code example #30
 def parse(self, response):
     ipa_content = response.xpath(
         """//*[@id="mw-content-text"]//span[@class="ipa"]/text()"""
     ).extract()
     keyword = response.meta.get("keyword")
     show_word = urllib.parse.unquote(response.url).split("/")[-1]
     item = TangspiderframeItem()
     if ipa_content:
         item['url'] = response.url
         item['title'] = keyword
         item["category"] = show_word
         item['content'] = ipa_content[0]
         yield item