コード例 #1
0
from __future__ import unicode_literals, print_function, absolute_import
###########################################################################
#          (C) Vrije Universiteit, Amsterdam (the Netherlands)            #
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

from amcat.tools.scraping.processors import PhpBBScraper


class BorstkankerNetScraper(PhpBBScraper):
    """Scraper for the forum at borstkanker.net/forumpatienten.

    Everything is inherited from PhpBBScraper; this subclass only supplies
    the forum's index URL.
    """
    # Forum index page. Presumably the crawl entry point read by
    # PhpBBScraper -- confirm against the base class.
    index_url = "http://borstkanker.net/forumpatienten/index.php"


if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(BorstkankerNetScraper)
コード例 #2
0
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import
###########################################################################
#          (C) Vrije Universiteit, Amsterdam (the Netherlands)            #
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

from amcat.tools.scraping.processors import PhpBBScraper

class KankerpatientNetScraper(PhpBBScraper):
    """Scraper for the forum at www.kankerpatient.net.

    Everything is inherited from PhpBBScraper; this subclass only supplies
    the forum's index URL.
    """
    # Forum index page. Presumably the crawl entry point read by
    # PhpBBScraper -- confirm against the base class.
    index_url = "http://www.kankerpatient.net/"

if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(KankerpatientNetScraper)
コード例 #3
0
        @param el: root html element
        @type el: lxml.html object

        @return: generator yielding urls (including non-displayed pages). This method
                 does not return the first page.
        """
        td = el.cssselect('td[class=catbg] td')[0]
        aes = td.cssselect('a')
        if len(aes) > 1:
            last = aes[-2].get('href')

            # Determine whether this is a board or thread
            ppp = THREADS_PER_BOARD if 'board' in last else MESSAGES_PER_THREAD
            pages = (int(last.split('.')[-1]) / ppp)
            base_url = ".".join(last.split('.')[:-1])

            for page in range(1, pages + 1):
                yield "%s.%s" % (base_url, str(page * ppp))

    def parse_post(self, el):
        """Placeholder for post parsing; not implemented.

        @param el: element to parse (currently ignored)
        @return: always None
        """
        return None

    def get(self, page):
        """Placeholder: produce no documents for *page*.

        @param page: page to process (currently ignored)
        @return: a new, empty list on every call
        """
        no_documents = []
        return no_documents


if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(BorstkankerForumScraper)
コード例 #4
0
        # Get page info
        post = page.doc.cssselect('.PhorumStdBlock')[0]
        page.props.author = post.cssselect(
            '.PhorumReadBodyHead strong')[0].text_content().strip()
        page.props.date = toolkit.readDate(
            post.cssselect('.PhorumReadBodyHead')[-1].text)
        page.props.text = post.cssselect('.PhorumReadBodyText')

        yield page

        # Get children
        current = page.doc.cssselect('.PhorumStdTable b')
        for url in self.get_children(page.doc):
            child = page.copy(parent=page)
            child.props.url = url
            child.prepare(self, force=True)

            for p in self.get(child):
                yield p


if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(BorstkankerPrikbordScraper)

    #from amcat.tools.scraping.exporters.builtin import Exporter
    #s = BorstkankerPrikbordScraper(Exporter())
    #doc = s.getdoc("http://borstkanker.startpagina.nl/prikbord/12236848/12341549/re-kalkdeeltjes-gevonden#msg-12341549")
    #print(len(list(s.get_children(doc))))
    #s.quit()
コード例 #5
0
                a = post.cssselect('a')[0]
                href = a.get('href')
                
                yield HTMLDocument(url=a.get('href'),
                                   headline=a.text_content())

            if not doc.cssselect('.post'):
                break


    def get(self, page):
        """Yield *page* with date and text set, then one document per comment.

        The date is read from the first ``.meta`` element of ``page.doc``
        after the fixed byline prefix has been replaced by a comma; the text
        is the list of ``.entry`` elements. Every ``#comments .comment``
        element yields a copy of *page* (with *page* as parent) carrying the
        comment's author, url and body.

        @param page: document whose ``doc`` attribute holds the parsed HTML
        @return: generator yielding *page* followed by its comments
        """
        meta_text = page.doc.cssselect('.meta')[0].text_content()
        date_text = meta_text.replace('Door borstkankertrefpunt op', ',')

        page.props.date = toolkit.readDate(date_text.strip())
        page.props.text = page.doc.cssselect('.entry')

        yield page

        for comment_el in page.doc.cssselect('#comments .comment'):
            comment = page.copy(parent=page)

            author_el = comment_el.cssselect('.comment-meta.commentmetadata .fn')[0]
            comment.props.author = author_el.text_content()

            first_link = comment_el.cssselect('a')[0]
            comment.props.url = first_link.get('href')

            comment.props.text = comment_el.cssselect('.comment-body')

            yield comment


if __name__ == '__main__':
        # When executed as a script, hand the scraper class to the scraping
        # manager's main() entry point.
        from amcat.tools.scraping.manager import main
        main(BorstkankerTrefpuntScraper)
コード例 #6
0
    def _scrape_unit(self, page):
        """Yield *page* with its post properties set, then its children.

        Author, date and text are read from the first ``.PhorumStdBlock``
        element of ``page.doc`` and stored on ``page.props``. For every url
        produced by ``self.get_children(page.doc)`` a copy of *page* (with
        *page* as parent) is given that url, prepared, and passed to
        ``self.get``.

        @param page: document whose ``doc`` attribute holds the parsed HTML
        @return: generator yielding *page* followed by everything
                 ``self.get`` yields for each child
        """
        # Get page info
        post = page.doc.cssselect(".PhorumStdBlock")[0]
        page.props.author = post.cssselect(".PhorumReadBodyHead strong")[0].text_content().strip()
        page.props.date = toolkit.readDate(post.cssselect(".PhorumReadBodyHead")[-1].text)
        page.props.text = post.cssselect(".PhorumReadBodyText")

        yield page

        # Get children
        for url in self.get_children(page.doc):
            child = page.copy(parent=page)
            child.props.url = url
            child.prepare(self, force=True)

            # NOTE(review): children are processed through self.get rather
            # than self._scrape_unit -- confirm that get() delegates here.
            for p in self.get(child):
                yield p


if __name__ == "__main__":
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main

    main(BorstkankerPrikbordScraper)

    # from amcat.tools.scraping.exporters.builtin import Exporter
    # s = BorstkankerPrikbordScraper(Exporter())
    # doc = s.getdoc("http://borstkanker.startpagina.nl/prikbord/12236848/12341549/re-kalkdeeltjes-gevonden#msg-12341549")
    # print(len(list(s.get_children(doc))))
    # s.quit()
コード例 #7
0
                url[3] = urlencode(query)

                yield self.getdoc(urljoin(INDEX_URL, urlunsplit(url)))

    def get(self, thread):
        """Yield one document per post on every page of *thread*.

        The first post encountered is stored on *thread* itself; each later
        post is stored on a copy of *thread* that has *thread* as parent.

        @param thread: document whose ``doc`` is passed to ``self.get_pages``
        @return: generator yielding the filled-in documents
        """
        # Elements removed from the post cell before it is stored as text
        strip_selectors = ('h2', '.editImg', '.bijSchrift')
        is_first_post = True

        for page_doc in self.get_pages(thread.doc):
            rows = page_doc.cssselect('table.forumIndex > tr')

            # The first row of the table is skipped (presumably a header)
            for row in rows[1:]:
                if is_first_post:
                    doc = thread
                else:
                    doc = thread.copy(parent=thread)

                stamp = row.cssselect('span.bijSchrift')[0].text
                doc.props.date = toolkit.readDate(stamp)
                doc.props.author = row.cssselect('td.auteur h2')[0].text

                body_cell = row.cssselect('td')[0]
                for selector in strip_selectors:
                    body_cell.cssselect(selector)[0].drop_tree()

                doc.props.text = body_cell

                yield doc

                is_first_post = False


if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(BorstkankerNLScraper)
コード例 #8
0
                query['start'] = pag*ppp
                url[3] = urlencode(query)

                yield self.getdoc(urljoin(INDEX_URL, urlunsplit(url)))

    def get(self, thread):
        """Yield one document per post on every page of *thread*.

        The first post encountered is stored on *thread* itself; each later
        post is stored on a copy of *thread* that has *thread* as parent.

        @param thread: document whose ``doc`` is passed to ``self.get_pages``
        @return: generator yielding the filled-in documents
        """
        # Elements removed from the post cell before it is stored as text
        strip_selectors = ('h2', '.editImg', '.bijSchrift')
        is_first_post = True

        for page_doc in self.get_pages(thread.doc):
            rows = page_doc.cssselect('table.forumIndex > tr')

            # The first row of the table is skipped (presumably a header)
            for row in rows[1:]:
                if is_first_post:
                    doc = thread
                else:
                    doc = thread.copy(parent=thread)

                stamp = row.cssselect('span.bijSchrift')[0].text
                doc.props.date = toolkit.readDate(stamp)
                doc.props.author = row.cssselect('td.auteur h2')[0].text

                body_cell = row.cssselect('td')[0]
                for selector in strip_selectors:
                    body_cell.cssselect(selector)[0].drop_tree()

                doc.props.text = body_cell

                yield doc

                is_first_post = False
        

if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(BorstkankerNLScraper)
コード例 #9
0
###########################################################################
#          (C) Vrije Universiteit, Amsterdam (the Netherlands)            #
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT.  If not, see <http://www.gnu.org/licenses/>.  #
###########################################################################

from amcat.tools.scraping.processors import PhpBBScraper


class BorstkankerNetScraper(PhpBBScraper):
    """Scraper for the forum at borstkanker.net/forumpatienten.

    Everything is inherited from PhpBBScraper; this subclass only supplies
    the forum's index URL.
    """
    # Forum index page. Presumably the crawl entry point read by
    # PhpBBScraper -- confirm against the base class.
    index_url = "http://borstkanker.net/forumpatienten/index.php"


if __name__ == "__main__":
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main

    main(BorstkankerNetScraper)
コード例 #10
0
        @param el: root html element
        @type el: lxml.html object

        @return: generator yielding urls (including non-displayed pages). This method
                 does not return the first page.
        """
        td = el.cssselect('td[class=catbg] td')[0] 
        aes = td.cssselect('a')
        if len(aes) > 1:
            last = aes[-2].get('href')

            # Determine whether this is a board or thread
            ppp = THREADS_PER_BOARD if 'board' in last else MESSAGES_PER_THREAD
            pages = (int(last.split('.')[-1]) / ppp)
            base_url = ".".join(last.split('.')[:-1])

            for page in range(1, pages+1):
                yield "%s.%s" % (base_url, str(page*ppp))

    def parse_post(self, el):
        """Placeholder for post parsing; not implemented.

        @param el: element to parse (currently ignored)
        @return: always None
        """
        return None

    def get(self, page):
        """Placeholder: produce no documents for *page*.

        @param page: page to process (currently ignored)
        @return: a new, empty list on every call
        """
        no_documents = []
        return no_documents

if __name__ == '__main__':
    # When executed as a script, hand the scraper class to the scraping
    # manager's main() entry point.
    from amcat.tools.scraping.manager import main
    main(BorstkankerForumScraper)