コード例 #1
0
def extract_evolve_info(sel: parsel.Selector):
    """Return the first sentence-like fragment mentioning ``'evolve'``.

    The selector's serialized content is passed through
    ``html.remove_tags``, stripped, and split on ``'.'``.  The first
    fragment containing the (case-sensitive) substring ``'evolve'`` is
    returned with surrounding whitespace removed; ``None`` is returned
    when no fragment matches.
    """
    plain_text = html.remove_tags(sel.get()).strip()
    matching = (
        fragment.strip()
        for fragment in plain_text.split('.')
        if 'evolve' in fragment
    )
    return next(matching, None)
コード例 #2
0
ファイル: crawler.py プロジェクト: claytonsands/infoglobo
    def parse(self, response):
        """Convert the ``<item>`` elements of *response* into a feed dict and store it.

        Each item contributes its title, its link and a ``description``
        list built from the HTML embedded in its ``<description>`` text:
        one entry per ``<img>`` (``type: image``, the ``src`` attribute)
        or ``<p>`` (``type: text``, the element text) in document order,
        followed by a single ``type: links`` entry holding every
        ``//div/ul/li/a/@href`` value found in that HTML.

        The resulting ``{'feed': [...]}`` document is inserted into the
        ``feed`` collection of the ``crawler`` database reached through a
        default ``MongoClient()``, and is then returned.
        """
        feed = []
        # Iterate the items themselves and query relative to each one, so
        # title/link/description always belong to the same <item> and the
        # whole document is not re-scanned on every iteration.
        for item in response.xpath("//item"):
            title = item.xpath("title/text()").get()
            link = item.xpath("link/text()").get()
            # NOTE(review): an <item> without <description> yields None
            # here -- confirm how Selector should treat that case.
            description_sel = Selector(item.xpath("description/text()").get())
            links = description_sel.xpath("//div/ul/li/a/@href").getall()

            soup = BeautifulSoup(description_sel.get(), "html.parser")
            # Built fresh for every item: sharing one list across items made
            # each description accumulate the entries of all previous items.
            tag_description = [
                {'type': 'image', 'content': tag.get('src')}
                if tag.name == 'img'
                else {'type': 'text', 'content': tag.text}
                for tag in soup.find_all(['img', 'p'])
            ]
            tag_description.append({
                'type': 'links',
                'content': links,
            })
            feed.append({
                'item': {
                    'title': title,
                    'link': link,
                    'description': tag_description,
                },
            })

        result = {'feed': feed}
        client = MongoClient()
        try:
            # insert_one replaces the deprecated Collection.insert.
            # NOTE(review): PyMongo is expected to add an ``_id`` key to
            # ``result`` in place, as the old insert did -- confirm the
            # returned dict is still exportable downstream.
            client.crawler.feed.insert_one(result)
        finally:
            # Always release the connection, even when the insert fails.
            client.close()
        return result

#scrapy shell https://revistaautoesporte.globo.com/rss/ultimas/feed.xml
#scrapy runspider crawler\spiders\infoglobo.py -o infoglobo.jsonlines
コード例 #3
0
def extract_effect(sel: parsel.Selector):
    """Return the selector's text content with tags removed and spacing normalised.

    The serialized selector content is passed through
    ``html.remove_tags`` and stripped; every match of the module-level
    ``re_multiple_spaces`` pattern (presumably runs of whitespace --
    defined outside this snippet) is then replaced by a single space.
    """
    raw_markup = sel.get()
    stripped_text = html.remove_tags(raw_markup).strip()
    normalised = re_multiple_spaces.sub(' ', stripped_text)
    return normalised
コード例 #4
0
ファイル: parser.py プロジェクト: cement-hools/poems_CBV
def get_title(src_title: Selector, text: str) -> str:
    """Return the title extracted from *src_title*, with a fallback.

    When the extracted title is exactly the placeholder ``'* * *'``,
    the first line of *text* (everything before the first newline) is
    returned instead.
    """
    extracted: str = src_title.get()
    if extracted != '* * *':
        return extracted
    # Placeholder title: use the first line of the text in its place.
    first_line, _, _ = text.partition('\n')
    return first_line