Example 1
    def parse_standard_rss(self, url):
        headers = self.set_headers()

        req = urllib.request.Request(url, headers=headers)
        parse_xml_url = urllib.request.urlopen(req)

        xml_page = parse_xml_url.read()
        parse_xml_url.close()

        # lxml's HTML parser lowercases tag names, which is why <pubDate>
        # is looked up as "pubdate" below
        soup_page = BeautifulSoup(xml_page, "lxml")
        channel = soup_page.find("channel")
        news_list = channel.find_all("item")

        links = []
        for getfeed in news_list:
            titolo = getfeed.title.text

            description = ""
            if getfeed.description and getfeed.description.text:
                description = getfeed.description.text

            link_id = generate_link_id(titolo)

            links.append({
                'id': link_id,
                'titolo': titolo,
                'text': description,
                # HTML parsers treat <link> as a void element, so the URL
                # text ends up as the tag's next sibling, not its content
                'url': getfeed.link.nextSibling.rstrip(),
                'data': parse_date(getfeed.pubdate.text)
            })

            if JUST_ONE_LINK:
                break

        return links
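Example 1 depends on imports and helpers defined elsewhere in the project (`generate_link_id`, `parse_date`, the `JUST_ONE_LINK` flag). A minimal sketch of what they might look like; the implementations below are assumptions, not the project's actual code:

import hashlib
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup

# Hypothetical flag: when True, stop after the first parsed item
JUST_ONE_LINK = False


def generate_link_id(title):
    # Hypothetical implementation: a stable id derived from the title
    return hashlib.md5(title.strip().encode("utf-8")).hexdigest()


def parse_date(raw):
    # Hypothetical implementation: RSS pubDate follows RFC 822,
    # e.g. "Mon, 06 Sep 2021 16:45:00 +0000"
    return datetime.strptime(raw.strip(), "%a, %d %b %Y %H:%M:%S %z")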
Example 2
    def parse_page(self, url):
        links = []

        headers = super().set_headers()
        results = requests.get(url.strip(), headers=headers)

        soup = BeautifulSoup(results.text, "html.parser")
        container = soup.find('ul', class_='collection-page__list')

        # find() returns None when the layout changes; guard before find_all()
        articles = container.find_all('article') if container else []

        if articles:
            for article in articles:
                heading = article.find('h2')
                if heading:
                    title = heading.find('a').contents[0].text
                    href = heading.find('a')['href']
                    data = article.find('p', class_="article-teaser-vertical__date").text.strip()

                    links.append({
                        'id': generate_link_id(title),
                        'titolo': title,
                        'text': '',
                        'url': "https://www.greenbiz.com" + href,
                        'data': data
                    })

        return links
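Examples 2 through 5 call `super().set_headers()`, so these parsers presumably share a scraping base class. A minimal sketch, assuming a hypothetical `BaseParser`:

import time


class BaseParser:
    # Hypothetical base class: the snippets only rely on set_headers()
    # and, in Example 5, on delay()

    def set_headers(self):
        # A browser-like User-Agent to avoid naive bot blocking;
        # the exact string is an assumption
        return {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                              "AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/96.0 Safari/537.36"}

    def delay(self, seconds=2):
        # Politeness pause between requests
        time.sleep(seconds)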
Example 3
    def parse_page(self, url):
        links = []

        headers = super().set_headers()
        results = requests.get(url.strip(), headers=headers)

        soup = BeautifulSoup(results.text, "html.parser")
        container = soup.find('div', id='topics')
        articles = container.find_all('article', class_='item') if container else []

        if articles:
            for article in articles:
                heading = article.find('h1')
                if heading:
                    title = heading.find('a').contents[0]
                    href = heading.find('a')['href']
                    data = article.find('div', class_="date").text.strip()
                    links.append({
                        'id': generate_link_id(title),
                        'titolo': title,
                        'text': '',
                        'url': href,
                        'data': data
                    })

        return links
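Example 2 prefixes a hard-coded domain while Example 3 trusts `href` to be absolute. `urllib.parse.urljoin` handles both cases and could replace either approach:

from urllib.parse import urljoin

# urljoin leaves absolute URLs untouched and resolves relative ones
urljoin("https://www.greenbiz.com", "/article/some-slug")
# -> 'https://www.greenbiz.com/article/some-slug'
urljoin("https://www.greenbiz.com", "https://other.site/post")
# -> 'https://other.site/post'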
Example 4
    def get_items(self):
        super().get_items()

        today = get_today_date()

        headers = super().set_headers()
        results = requests.get(self.url.strip(), headers=headers)

        soup = BeautifulSoup(results.text, "html.parser")
        container = soup.find('article', class_='main-feature')
        title = container.find('h2')

        links = [{
            'id': generate_link_id(title.text),
            'titolo': title.text,
            'text': '',
            'url': title.find('a').get('href'),
            'data': today
        }]

        def get_links_from_structure(articles):
            links = []
            rows = articles.find_all('div', class_='row')

            for row in rows:
                a = row.find('a')

                if a:
                    h3 = a.find('h3')

                    if h3:

                        links.append({
                            'id': generate_link_id(h3.text),
                            'titolo': h3.text,
                            'text': '',
                            'url': a.get('href'),
                            'data': today
                        })

            return links

        # Extend rather than reassign, so the featured article collected
        # above is kept alongside the card links
        container = soup.find('div', class_='cards')
        if container:
            links += get_links_from_structure(container)

        self.links = links
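`get_today_date()` is another project helper not shown here; since the scraped page exposes no per-article dates, it presumably stamps every item with the scrape date. A hypothetical one-liner:

from datetime import date


def get_today_date():
    # Hypothetical: ISO-formatted scrape date, e.g. "2021-09-06"
    return date.today().isoformat()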
Example 5
    def parse_page(self, url):
        links = []

        base_url = self.params['base_url']
        article_selector = self.params['article_selector']
        title_selector = self.params['title_selector']
        link_selector = self.params['link_selector']

        date_selector = self.params.get('date_selector')

        try:
            headers = super().set_headers()
            results = requests.get(url.strip(), headers=headers)
            if TEST:
                print(results.text)

            soup = BeautifulSoup(results.text, "html.parser")
            articles = soup.select(article_selector)
            if TEST:
                print(article_selector)
                print(articles)

            if articles:
                print("Getting links")

                if TEST:
                    print("Len: %d" % len(articles))

                for article in articles:

                    try:
                        title = article.select(title_selector)[0].text.strip()
                        if TEST:
                            print("---------------------------------")
                            print(title)

                        link = article.select(link_selector)[0]['href']
                        if not link.startswith("http"):
                            link = base_url + link

                        if TEST:
                            print("---------------------------------")
                            print(link)

                        date = ''
                        try:
                            if date_selector:
                                date = article.select(
                                    date_selector)[0].text.strip()
                        except Exception as e1:
                            print("Exception in parse_page > for loop > get date")
                            print(e1)

                        if TEST:
                            print(date)

                        links.append({
                            'id': generate_link_id(title),
                            'titolo': title,
                            'text': '',
                            'url': link,
                            'data': date
                        })

                        if TEST:
                            break
                    except Exception as e:
                        # TODO write exception to log for analysis
                        print("Exception in parse_page > for loop")
                        print(e)

            self.delay()

        except Exception as e:
            # TODO write exception to log for analysis
            print("Exception in parse_page")
            print(e)
            links = []

        return links
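The selector-driven version reads its configuration from `self.params`. A plausible shape for that dictionary; the selectors and URL below are illustrative, not taken from a real site:

# Illustrative configuration for the selector-driven parser;
# every selector and URL here is a made-up example
params = {
    'base_url': 'https://example.com',
    'article_selector': 'div.news-list article',
    'title_selector': 'h2.headline',
    'link_selector': 'h2.headline a',
    'date_selector': 'time.published',  # optional key
}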
Example 6
def parse_page(url, params):
    links = []

    logger.info("Parsing url: " + url)

    base_url = params['base_url']
    article_selector = params['article_selector']
    title_selector = params['title_selector']
    link_selector = params['link_selector']

    date_selector = params.get('date_selector')

    try:
        headers = set_headers()
        results = requests.get(url.strip(), headers=headers)

        soup = BeautifulSoup(results.text, "html.parser")
        articles = soup.select(article_selector)

        if articles:
            if DEBUG:
                logger.debug("Number of articles found: %d" % len(articles))

            for article in articles:

                try:
                    title = article.select(title_selector)[-1].text.strip()

                    if DEBUG:
                        logger.debug("Title: " + title)

                    link = article.select(link_selector)[-1]['href']
                    if not link.startswith("http"):
                        link = base_url + link

                    if DEBUG:
                        logger.debug("Link: " + link)

                    date = ''
                    try:
                        if date_selector:
                            d = article.select(date_selector)[-1].text.strip()
                            date = parse_date(d)
                    except Exception as e0:
                        logger.error(
                            "Exception in parse_page > for loop > get date: %s"
                            % e0)

                    if DEBUG:
                        logger.debug("Date: " + str(date))

                    links.append({
                        'id': generate_link_id(title),
                        'titolo': title,
                        'text': '',
                        'url': link,
                        'data': date
                    })

                    if TEST:
                        break

                except Exception as e:
                    logger.error("Exception in parse_page > for loop: %s" % e)

        delay()

    except Exception as e:
        logger.error("Exception in parse_page: %s" % e)
        links = []

    return links
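Finally, a hedged sketch of how the module-level `parse_page` might be driven, assuming the same helpers (`set_headers`, `parse_date`, `generate_link_id`, `delay`) exist in the module:

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

DEBUG = True
TEST = False

# Illustrative call; the URL and selectors are made-up examples
items = parse_page("https://example.com/news", {
    'base_url': 'https://example.com',
    'article_selector': 'article.post',
    'title_selector': 'h2 a',
    'link_selector': 'h2 a',
})
for item in items:
    logger.info("%s -> %s", item['titolo'], item['url'])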