def parse_url(self, response):
    """Yield one item per anchor carrying class ``elemRelative``."""
    for href in response.xpath('//a[@class="elemRelative"]/@href').extract():
        item = SpiderframeItem()
        item['url'] = href
        yield item
    def parse(self, response):
        """Parse a Cambridge-dictionary word page into a SpiderframeItem.

        Extracts the displayed headword plus the UK and US phonetic
        transcriptions and yields them in a single item; any part that is
        missing defaults to the empty string.
        """
        word = response.meta.get("keyword")
        show_word, uk_phonetic, us_phonetic = '', '', ''
        di_title = response.xpath(
            '//div[@class="di-title"]//span[@class="hw dhw"]')
        if di_title:
            show_word = ''.join(di_title[0].xpath("./text()").extract())

        uk_span = response.xpath(
            '//div[@class="pos-header dpos-h"]//span[@class="uk dpron-i "]/span[@class="pron dpron"]'
        )
        if uk_span:
            uk_phonetic = ''.join(uk_span[0].xpath('.//text()').extract())
            if uk_phonetic:
                uk_phonetic = "[" + uk_phonetic.strip('/') + "]"

        us_span = response.xpath(
            '//div[@class="pos-header dpos-h"]//span[@class="us dpron-i "]/span[@class="pron dpron"]'
        )
        if us_span:
            # BUG FIX: the original read uk_span[0] here, so the US field
            # always duplicated the UK pronunciation.
            us_phonetic = ''.join(us_span[0].xpath('.//text()').extract())
            if us_phonetic:
                us_phonetic = "[" + us_phonetic.strip('/') + "]"

        item = SpiderframeItem()
        item['title'] = word             # the queried word
        item['category'] = show_word     # headword as displayed on the page
        item['content'] = uk_phonetic    # UK phonetic transcription
        item['item_name'] = us_phonetic  # US phonetic transcription
        yield item
Example #3
0
 def parse(self, response):
     """Yield absolute article URLs found in the listing boxes."""
     hrefs = response.xpath('//div[@class="box"]//ul/li//a//@href').extract()
     for href in hrefs:
         item = SpiderframeItem()
         item['url'] = "http://www.enet.gr/" + href
         yield item
 def parse_link(self, response):
     """Emit one item per headline (<h2>) link on the page."""
     for href in response.xpath('//h2/a/@href').extract():
         item = SpiderframeItem()
         item['url'] = href
         yield item
 def parse_url(self, response):
     """Yield items for article links, keeping only https URLs."""
     for href in response.xpath('//article/a//@href').extract():
         if "https://" not in href:
             continue
         item = SpiderframeItem()
         item['url'] = href
         yield item
    def parse(self, response):
        """Build a SpiderframeItem from an Oxford-dictionary word page.

        Collects the displayed headword and the British/American phonetic
        spellings; any missing piece defaults to the empty string.
        """

        def first_or_empty(xpath):
            # First text node matched by *xpath*, or '' when nothing matches.
            found = response.xpath(xpath).extract()
            return found[0] if found else ''

        show_word = first_or_empty('//div[@class="webtop"]/h1/text()')
        ph_en = first_or_empty(
            '//div[@class="webtop"]/span[@class="phonetics"]/div[@class="phons_br"]/span/text()'
        )
        ph_am = first_or_empty(
            '//div[@class="webtop"]/span[@class="phonetics"]/div[@class="phons_n_am"]/span/text()'
        )

        item = SpiderframeItem()
        item['title'] = response.meta.get("keyword")  # the queried word
        item['category'] = show_word  # headword as displayed
        item['content'] = ph_en       # British phonetics
        item['item_name'] = ph_am     # American phonetics
        return item
Example #7
0
 def parse_url(self, response):
     """Yield absolute article URLs from the news listing."""
     base = "http://www.hurriyetdailynews.com"
     for href in response.xpath('//div[@class="news"]/a/@href').extract():
         item = SpiderframeItem()
         item['url'] = base + href
         yield item
 def parse_url(self, response):
     """Yield absolute URLs for every anchor with class ``item``."""
     base = "https://www.masrawy.com"
     for href in response.xpath('//a[@class="item"]/@href').extract():
         item = SpiderframeItem()
         item['url'] = base + href
         yield item
Example #9
0
    def parse(self, response):
        """Scrape the pronunciation block of a dictionary result page.

        Yields one SpiderframeItem carrying the looked-up word, the audio
        token taken from the first ``naudio`` attribute found, and the
        UK/US phonetic transcriptions.  (A long commented-out sentence
        scraping branch was removed as dead code; the loop variable was
        renamed so it no longer shadows the ``item`` built at the end.)
        """
        # The queried word is the last query-string component of the URL.
        word = response.url.split("=")[-1]
        word_tag = response.xpath(
            '//div[@class="word-cont"]/h1/text()').extract()  # displayed word
        if word_tag:
            spans = response.xpath('//div[@class="phonetic"]/span')
            en_phonetic, am_phonetic, phonetic_words = '', '', []
            for span in spans:
                # The leading text node ("英" / "美") tells UK from US.
                label = ''.join(span.xpath("./text()").extract()).strip()
                label = label.replace(" '", '').replace("’", '')
                if label == "英":
                    en_phonetic = ''.join(
                        span.xpath('./bdo[@lang="EN-US"]/text()').extract())
                    en_audio = ''.join(
                        span.xpath('./i[1]/@naudio').extract())
                    if en_audio:
                        phonetic_words.append(en_audio.split("=")[-1])
                elif label == "美":
                    am_phonetic = ''.join(
                        span.xpath('./bdo[@lang="EN-US"]/text()').extract())
                    am_audio = ''.join(
                        span.xpath('./i[1]/@naudio').extract())
                    if am_audio:
                        phonetic_words.append(am_audio.split("=")[-1])
            phonetic_word = phonetic_words[0] if phonetic_words else ''

            item = SpiderframeItem()
            item['title'] = word              # queried word
            item['category'] = phonetic_word  # audio token for the word
            item['content'] = en_phonetic     # UK transcription
            item['item_name'] = am_phonetic   # US transcription
            yield item
Example #10
0
 def parse_link(self, response):
     """Yield one item per tracked outbound link on the page."""
     selector = '//a[contains(@class, "url track-click")]/@href'
     for href in response.xpath(selector).extract():
         item = SpiderframeItem()
         item['url'] = href
         yield item
Example #11
0
 def parse_url(self, response):
     """Yield one item per teaser-title link."""
     selector = '//a[contains(@class, "dre-item__title")]/@href'
     for href in response.xpath(selector).extract():
         item = SpiderframeItem()
         item['url'] = href
         yield item
 def parse_url(self, response):
     """Yield absolute article URLs from headline (<h3>) links."""
     base = "https://www.alittihad.ae"
     for href in response.xpath('//h3/a/@href').extract():
         item = SpiderframeItem()
         item['url'] = base + href
         yield item
Example #13
0
 def parse(self, response):
     """Yield one item per lightbox (iframe cboxElement) link."""
     hrefs = response.xpath(
         '//a[@class="iframe cboxElement"]/@href').extract()
     for href in hrefs:
         item = SpiderframeItem()
         item['url'] = href
         yield item
 def parse(self, response):
     """Yield one item per example sentence inside the #tmTable grid."""
     for node in response.xpath('//*[@id="tmTable"]/div/div[2]/span/span'):
         text = ''.join(node.xpath(".//span/text()").extract())
         item = SpiderframeItem()
         item['url'] = response.url
         item['content'] = text
         yield item
 def parse_url(self, response):
     """Yield one item per teaser (ankeiler) link."""
     for href in response.xpath(
             '//a[@class="ankeiler__link"]/@href').extract():
         item = SpiderframeItem()
         item['url'] = href
         yield item
Example #16
0
 def parse_url(self, response):
     """Yield absolute teaser-link URLs for gp.se."""
     base = "https://www.gp.se"
     for href in response.xpath(
             '//a[@class="c-teaser__link"]/@href').extract():
         item = SpiderframeItem()
         item['url'] = base + href
         yield item
 def parse_url(self, response):
     """Yield absolute article URLs from the web-headline field."""
     hrefs = response.xpath(
         '//div[@class="field field-name-field-webrubrik"]/a/@href'
     ).extract()
     for href in hrefs:
         item = SpiderframeItem()
         item['url'] = "https://www.information.dk" + href
         yield item
 def parse(self, response):
     """Yield absolute article URLs built from ``differentTall`` anchors."""
     base = "http://www.alkhaleej.ae"
     for href in response.xpath(
             '//a[@class="differentTall"]/@href').extract():
         item = SpiderframeItem()
         item['url'] = base + href
         yield item
Example #19
0
 def parse_url(self, response):
     """Yield absolute, percent-encoded article URLs from <h3> links."""
     for href in response.xpath('//h3/a/@href').extract():
         quoted = urllib.parse.quote(href)
         item = SpiderframeItem()
         item['url'] = "https://www.tagesanzeiger.ch" + quoted
         yield item
Example #20
0
 def parse_url(self, response):
     """Yield items only for links pointing at jyllands-posten.dk."""
     hrefs = response.xpath(
         '//div[contains(@class, "art")]//a/@href').extract()
     for href in hrefs:
         if "https://jyllands-posten.dk" not in href:
             continue
         item = SpiderframeItem()
         item['url'] = href
         yield item
 def parse_url(self, response):
     """Extract the large-CDN video URL from the JSON response body."""
     payload = demjson.decode(response.text)
     cdn = payload["data"]["item"].get("cdn_url")
     item = SpiderframeItem()
     item["url"] = cdn.get("large")
     yield item
Example #22
0
 def parse(self, response):
     """Return one item holding the article's intro and body text."""
     intro = response.xpath(
         '/html/body/section[2]/section[1]/section[1]/p/text()').extract()
     body = response.xpath(
         '/html/body/section[2]/section[1]/section[1]/article/p/text()'
     ).extract()
     item = SpiderframeItem()
     item['url'] = response.url
     item['content'] = ' '.join(intro + body).replace('\n', '')
     return item
    def parse(self, response):
        """Yield an item with the page title, section text, and the
        hostname's leading label as the category."""
        heading = response.xpath(
            '//h1[contains(@class, "title")]/text()').extract()
        body = response.xpath('//section/p/text()').extract()

        item = SpiderframeItem()
        item['url'] = response.url
        # First label of the hostname, e.g. "news" from news.example.com.
        item['category'] = response.url.split('/')[2].split('.')[0]
        item['title'] = ''.join(heading)
        item['content'] = ''.join(body)
        yield item
Example #24
0
    def parse(self, response):
        """Yield an item with the heading, all paragraph text, and the
        second-to-last URL path segment as the category."""
        heading = response.xpath('//h1/text()').extract()
        paragraphs = response.xpath('//p/text()').extract()

        item = SpiderframeItem()
        item['url'] = response.url
        item['category'] = response.url.split('/')[-2]
        item['title'] = ''.join(heading)
        item['content'] = ''.join(paragraphs)
        yield item
 def parse_content(self, response):
     """Walk the decoded JSON payload and yield one item per expanded URI.

     Iterates two dict levels under ``items``; each non-empty list value
     whose first entry carries a ``target`` contributes that target's
     ``expandedUri``.
     """
     resp = demjson.decode(response.text)
     # BUG FIX: the default used to be [] — a list has no .values(), so an
     # absent "items" key raised AttributeError instead of yielding nothing.
     data = resp.get("items", {})
     for outer in data.values():
         for inner in outer.values():
             # isinstance instead of `type(...) is list` (idiomatic type
             # check; also accepts list subclasses).
             if isinstance(inner, list) and inner:
                 first = inner[0]
                 if first.get("target"):
                     item = SpiderframeItem()
                     item['url'] = first["target"]["expandedUri"]
                     yield item
 def parse(self, response):
     """Yield an item with the heading, flattened body text, and the
     first URL path segment as the category."""
     heading = ''.join(response.xpath('//h1/text()').extract())
     body = ''.join(response.xpath('//p//text()').extract())
     body = body.replace("\n", "  ").replace("\t", "  ")
     item = SpiderframeItem()
     item['url'] = response.url
     item['category'] = response.url.split('/')[3]
     item['title'] = heading
     item['content'] = body
     yield item
Example #27
0
    def parse_link(self, response):
        """Follow pagination links, then yield one item per mask-title link.

        Each "next" navigation link is re-requested with this same callback
        (dont_filter so revisits are allowed).
        """
        # BUG FIX: the original selector was '//li[@class="nxtnav "]/a/href'
        # — an <href> child *element*, which never exists — and the raw
        # Selector objects (no .extract()) were passed as request URLs,
        # which scrapy.Request rejects.  Select the @href attribute and
        # extract the strings.
        next_urls = response.xpath('//li[@class="nxtnav "]/a/@href').extract()
        for next_url in next_urls:
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_link,
                                 dont_filter=True)

        links = response.xpath('//div[@class="mask-title"]/a/@href').extract()
        for link in links:
            item = SpiderframeItem()
            item['url'] = link
            yield item
Example #28
0
 def parse(self, response):
     """Yield an item with the page title, flattened paragraph text, and
     the second-to-last URL path segment as the category."""
     heading = ''.join(
         response.xpath('//h2[@class="page-title"]/text()').extract())
     body = ''.join(response.xpath('//p/text()').extract())
     body = body.replace("\n", "  ").replace("\t", "  ")
     item = SpiderframeItem()
     item['url'] = response.url
     item['category'] = response.url.split('/')[-2]
     item['title'] = heading
     item['content'] = body
     yield item
Example #29
0
    def parse_item(self, response):
        """Yield items for sequence-section links, then follow pagination."""
        for href in response.xpath(
                "//section[(contains(@class, 'sequence'))]//a/@href").extract():
            item = SpiderframeItem()
            item['url'] = href
            yield item

        next_page = response.xpath(
            '//section/div/nav[1]/a[@rel="next"]/@href').extract()
        if next_page:
            yield scrapy.Request(url=next_page[0],
                                 callback=self.parse_item,
                                 dont_filter=True)
Example #30
0
 def parse_content(self, response):
     """Yield one item per matched CNKI sentence row, keyed by an md5 id."""
     rows = response.xpath('//span//tr/td/table//tr[3]/td/table//tr')
     for row in rows:
         sentence = "".join(
             row.xpath('.//td[not(@class)]//text()').extract())
         # Skip empty rows and the "partial match" banner row.
         if sentence and "未完全匹配句对" not in sentence:
             sentence = sentence.strip()
             item = SpiderframeItem()
             item['content'] = sentence
             item['title'] = response.meta.get("keyword")
             item['category'] = 'cnki'
             item['item_id'] = md5(sentence)
             yield item