from __future__ import unicode_literals, print_function, absolute_import
###########################################################################
#          (C) Vrije Universiteit, Amsterdam (the Netherlands)            #
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

from amcat.tools.scraping.processors import PhpBBScraper


class BorstkankerNetScraper(PhpBBScraper):
    """PhpBBScraper subclass configured with the borstkanker.net forum index.

    The class adds no behaviour of its own; it only supplies ``index_url``.
    """

    # NOTE(review): presumably read by PhpBBScraper as its starting page;
    # confirm against the base class.
    index_url = "http://borstkanker.net/forumpatienten/index.php"


if __name__ == '__main__':
    # Script entry point: hand the scraper class to the scraping manager.
    from amcat.tools.scraping.manager import main

    main(BorstkankerNetScraper)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, absolute_import
###########################################################################
#          (C) Vrije Universiteit, Amsterdam (the Netherlands)            #
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

from amcat.tools.scraping.processors import PhpBBScraper


class KankerpatientNetScraper(PhpBBScraper):
    """PhpBBScraper subclass configured with the kankerpatient.net index.

    The class adds no behaviour of its own; it only supplies ``index_url``.
    """

    # NOTE(review): presumably read by PhpBBScraper as its starting page;
    # confirm against the base class.
    index_url = "http://www.kankerpatient.net/"


if __name__ == '__main__':
    # Script entry point: hand the scraper class to the scraping manager.
    from amcat.tools.scraping.manager import main

    main(KankerpatientNetScraper)
@param el: root html element @type el: lxml.html object @return: generator yielding urls (including non-displayed pages). This method does not return the first page. """ td = el.cssselect('td[class=catbg] td')[0] aes = td.cssselect('a') if len(aes) > 1: last = aes[-2].get('href') # Determine wehther this is a board or thread ppp = THREADS_PER_BOARD if 'board' in last else MESSAGES_PER_THREAD pages = (int(last.split('.')[-1]) / ppp) base_url = ".".join(last.split('.')[:-1]) for page in range(1, pages + 1): yield "%s.%s" % (base_url, str(page * ppp)) def parse_post(self, el): pass def get(self, page): return [] if __name__ == '__main__': from amcat.tools.scraping.manager import main main(BorstkankerForumScraper)
# Get page info post = page.doc.cssselect('.PhorumStdBlock')[0] page.props.author = post.cssselect( '.PhorumReadBodyHead strong')[0].text_content().strip() page.props.date = toolkit.readDate( post.cssselect('.PhorumReadBodyHead')[-1].text) page.props.text = post.cssselect('.PhorumReadBodyText') yield page # Get children current = page.doc.cssselect('.PhorumStdTable b') for url in self.get_children(page.doc): child = page.copy(parent=page) child.props.url = url child.prepare(self, force=True) for p in self.get(child): yield p if __name__ == '__main__': from amcat.tools.scraping.manager import main main(BorstkankerPrikbordScraper) #from amcat.tools.scraping.exporters.builtin import Exporter #s = BorstkankerPrikbordScraper(Exporter()) #doc = s.getdoc("http://borstkanker.startpagina.nl/prikbord/12236848/12341549/re-kalkdeeltjes-gevonden#msg-12341549") #print(len(list(s.get_children(doc)))) #s.quit()
a = post.cssselect('a')[0] href = a.get('href') yield HTMLDocument(url=a.get('href'), headline=a.text_content()) if not doc.cssselect('.post'): break def get(self, page): date = page.doc.cssselect('.meta')[0].text_content().replace('Door borstkankertrefpunt op', ',') page.props.date = toolkit.readDate(date.strip()) page.props.text = page.doc.cssselect('.entry') yield page for comm in page.doc.cssselect('#comments .comment'): ca = page.copy(parent=page) ca.props.author = comm.cssselect('.comment-meta.commentmetadata .fn')[0].text_content() ca.props.url = comm.cssselect('a')[0].get('href') ca.props.text = comm.cssselect('.comment-body') yield ca if __name__ == '__main__': from amcat.tools.scraping.manager import main main(BorstkankerTrefpuntScraper)
def _scrape_unit(self, page):
    """Fill in the properties of a post page and yield it, then its children.

    Author, date and text are taken from the first ``.PhorumStdBlock``
    element of ``page.doc`` and stored on ``page.props``. For every url
    produced by ``self.get_children(page.doc)`` a copy of ``page`` (with
    ``page`` as its parent) is given that url, prepared with
    ``child.prepare(self, force=True)`` and scraped in turn.

    @param page: document object exposing ``doc``, ``props`` and ``copy``
    @return: generator yielding ``page`` first, followed by everything
             yielded for each child
    """
    # Get page info
    post = page.doc.cssselect(".PhorumStdBlock")[0]
    page.props.author = post.cssselect(".PhorumReadBodyHead strong")[0].text_content().strip()
    # The date is parsed from the own text of the last header element.
    page.props.date = toolkit.readDate(post.cssselect(".PhorumReadBodyHead")[-1].text)
    page.props.text = post.cssselect(".PhorumReadBodyText")
    yield page

    # Get children
    for url in self.get_children(page.doc):
        child = page.copy(parent=page)
        child.props.url = url
        child.prepare(self, force=True)
        # NOTE(review): the recursion goes through self.get, not
        # self._scrape_unit; confirm that get still dispatches to this method.
        for p in self.get(child):
            yield p


if __name__ == "__main__":
    # Script entry point: hand the scraper class to the scraping manager.
    from amcat.tools.scraping.manager import main

    main(BorstkankerPrikbordScraper)

    # from amcat.tools.scraping.exporters.builtin import Exporter
    # s = BorstkankerPrikbordScraper(Exporter())
    # doc = s.getdoc("http://borstkanker.startpagina.nl/prikbord/12236848/12341549/re-kalkdeeltjes-gevonden#msg-12341549")
    # print(len(list(s.get_children(doc))))
    # s.quit()
url[3] = urlencode(query) yield self.getdoc(urljoin(INDEX_URL, urlunsplit(url))) def get(self, thread): fipo = True for page in self.get_pages(thread.doc): for post in page.cssselect('table.forumIndex > tr')[1:]: ca = thread if fipo else thread.copy(parent=thread) ca.props.date = toolkit.readDate( post.cssselect('span.bijSchrift')[0].text) ca.props.author = post.cssselect('td.auteur h2')[0].text texttd = post.cssselect('td')[0] texttd.cssselect('h2')[0].drop_tree() texttd.cssselect('.editImg')[0].drop_tree() texttd.cssselect('.bijSchrift')[0].drop_tree() ca.props.text = texttd yield ca fipo = False if __name__ == '__main__': from amcat.tools.scraping.manager import main main(BorstkankerNLScraper)
query['start'] = pag*ppp url[3] = urlencode(query) yield self.getdoc(urljoin(INDEX_URL, urlunsplit(url))) def get(self, thread): fipo = True for page in self.get_pages(thread.doc): for post in page.cssselect('table.forumIndex > tr')[1:]: ca = thread if fipo else thread.copy(parent=thread) ca.props.date = toolkit.readDate(post.cssselect('span.bijSchrift')[0].text) ca.props.author = post.cssselect('td.auteur h2')[0].text texttd = post.cssselect('td')[0] texttd.cssselect('h2')[0].drop_tree() texttd.cssselect('.editImg')[0].drop_tree() texttd.cssselect('.bijSchrift')[0].drop_tree() ca.props.text = texttd yield ca fipo = False if __name__ == '__main__': from amcat.tools.scraping.manager import main main(BorstkankerNLScraper)
###########################################################################
#          (C) Vrije Universiteit, Amsterdam (the Netherlands)            #
#                                                                         #
# This file is part of AmCAT - The Amsterdam Content Analysis Toolkit     #
#                                                                         #
# AmCAT is free software: you can redistribute it and/or modify it under  #
# the terms of the GNU Affero General Public License as published by the  #
# Free Software Foundation, either version 3 of the License, or (at your  #
# option) any later version.                                              #
#                                                                         #
# AmCAT is distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
# You should have received a copy of the GNU Affero General Public        #
# License along with AmCAT. If not, see <http://www.gnu.org/licenses/>.   #
###########################################################################

from amcat.tools.scraping.processors import PhpBBScraper


class BorstkankerNetScraper(PhpBBScraper):
    """PhpBBScraper subclass configured with the borstkanker.net forum index.

    The class adds no behaviour of its own; it only supplies ``index_url``.
    """

    # NOTE(review): presumably read by PhpBBScraper as its starting page;
    # confirm against the base class.
    index_url = "http://borstkanker.net/forumpatienten/index.php"


if __name__ == "__main__":
    # Script entry point: hand the scraper class to the scraping manager.
    from amcat.tools.scraping.manager import main

    main(BorstkankerNetScraper)
@param el: root html element @type el: lxml.html object @return: generator yielding urls (including non-displayed pages). This method does not return the first page. """ td = el.cssselect('td[class=catbg] td')[0] aes = td.cssselect('a') if len(aes) > 1: last = aes[-2].get('href') # Determine wehther this is a board or thread ppp = THREADS_PER_BOARD if 'board' in last else MESSAGES_PER_THREAD pages = (int(last.split('.')[-1]) / ppp) base_url = ".".join(last.split('.')[:-1]) for page in range(1, pages+1): yield "%s.%s" % (base_url, str(page*ppp)) def parse_post(self, el): pass def get(self, page): return [] if __name__ == '__main__': from amcat.tools.scraping.manager import main main(BorstkankerForumScraper)