Example #1
 def _scrape_unit(self, article_id):
     article = HTMLDocument(url = self.article_url.format(**locals()))
     article.prepare(self)
     article.props.text = article.doc.cssselect("font.artbody")
     if len("".join([t.text_content() for t in article.props.text])) < 100:
         return
     for i, table in enumerate(article.doc.cssselect("table")):
         if table.get('class') == "body":
             table_after_body = article.doc.cssselect("table")[i + 1]
     page_date = re.search(
         "Pagina ([0-9]+), ([0-9]{2}\-[0-9]{2}\-[0-9]{4})",
         table_after_body.text_content())
     article.props.pagenr = page_date.group(1)
     article.props.date = readDate(page_date.group(2))
     article.props.section = self.current_section
     article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
     if article.doc.cssselect(".artsubheader"):
         article.props.byline = article.doc.cssselect(".artsubheader")[0]
     if article.doc.cssselect("td.artauthor"):
         article.props.author = article.doc.cssselect("td.artauthor")[0].text.split(":")[1].strip()
     dateline_match = re.search(
         "^([A-Z][a-z]+(( |/)[A-Z][a-z]+)?)\n",
         "\n".join([n.text_content() for n in article.props.text]).strip())
     if dateline_match:
         article.props.dateline = dateline_match.group(1)
                                       
     yield article
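Every snippet on this page goes through the scraper framework's HTMLDocument wrapper (prepare(), props, readDate). As a rough, self-contained sketch of the extraction pattern in Example #1 using only lxml with cssselect, on made-up markup (the HTML, selectors and values below are hypothetical and purely for illustration):

    # Stand-alone sketch of the Example #1 pattern; markup and selectors are invented.
    import re
    from lxml import html

    SAMPLE = """<div>
    <table class="body"><tr><td class="artheader"> Example headline </td></tr></table>
    <table><tr><td>Pagina 3, 01-02-2010</td></tr></table>
    <font class="artbody">AMSTERDAM
    Some body text.</font>
    </div>"""

    doc = html.fromstring(SAMPLE)
    headline = doc.cssselect("td.artheader")[0].text_content().strip()
    body = "\n".join(t.text_content() for t in doc.cssselect("font.artbody"))
    page_date = re.search(r"Pagina ([0-9]+), ([0-9]{2}-[0-9]{2}-[0-9]{4})",
                          doc.cssselect("table")[1].text_content())
    pagenr, date_str = page_date.group(1), page_date.group(2)
    print(headline, pagenr, date_str, body[:30])

In the framework versions above, prepare() appears to fetch and parse article.props.url into article.doc before the selectors run; the stand-alone sketch simply parses a string instead.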
Example #2
    def _get_units(self):
        for section in self.sections:
            page = 1
            url = self.page_url.format(**locals())

            date = _date.today()
            ipage = self.getdoc(url)
            while date >= self.options['date']:
                if not ipage.cssselect("#main ul.snelnieuws_list li.item"):
                    print("\nNo articles found as far back as given date\n")
                    break
                for unit in ipage.cssselect(
                        '#main ul.snelnieuws_list li.item'):
                    href = unit.cssselect('a')[0].get('href')
                    article = HTMLDocument(url=href)
                    article.prepare(self)
                    try:
                        date = readDate(
                            article.doc.cssselect("span.datum")
                            [0].text).date()
                    except IndexError:
                        continue
                    if date == self.options['date']:
                        yield article
                    elif date < self.options['date']:
                        break

                page += 1
                nxt_url = self.page_url.format(**locals())
                ipage = self.getdoc(nxt_url)
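The _get_units variants like the one above share a single loop shape: page through an index from newest to oldest, keep the items dated on the target day, and stop as soon as something older shows up. A minimal stand-alone sketch of that cutoff loop (fetch_page is a hypothetical callable returning (date, url) pairs per index page, not part of the framework):

    from datetime import date

    def iter_units_on(target: date, fetch_page):
        """Yield urls dated `target`, paging an index until older items appear."""
        page = 1
        while True:
            items = fetch_page(page)   # hypothetical: [(item_date, url), ...], newest first
            if not items:
                break                  # index exhausted before reaching the target date
            for item_date, url in items:
                if item_date == target:
                    yield url
                elif item_date < target:
                    return             # everything further back is older; stop
            page += 1

For example, list(iter_units_on(date(2014, 3, 1), my_fetch_page)) would collect that day's urls with any page-fetching callable; the snippets above do the same but build HTMLDocument objects and read the date out of each article page.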
Example #4
    def _scrape_unit(self, li):
        a = li.cssselect("li > a")[0]
        article = HTMLDocument(url=urljoin(self.index_url, a.get('href')))
        article.props.headline = a.text
        article.props.kicker = li.cssselect("div.infoboard a.kicker")[0].text
        article.props.intro = li.cssselect("p")
        article.props.date = readDate(
            li.cssselect("div.infoboard span.time")[0].text_content())
        article.prepare(self)
        articletime = article.doc.cssselect("p.articletime")[0].text_content()
        if len(articletime.split("|")) > 2:
            article.props.date = readDate(" ".join(
                articletime.split("|")[:-1]))
            article.props.author = articletime.split("|")[-1]
        else:
            article.props.author = articletime.strip()
            if " Korrespondent" in article.props.author:
                article.props.author = article.props.author.split(
                    "Korrespondent")[1].strip()

        for ad in article.doc.cssselect("div.noprint"):
            ad.drop_tree()
        article.props.text = article.doc.cssselect(
            "p.articlelead, #articletext")
        article.props.section = article.doc.cssselect(
            "div.headtop span.sitetop")[0].text_content()
        yield article
Example #5
    def get_comments(self, page):
        for li in page.doc.cssselect("ul.commentlist li.comment"):
            comment = HTMLDocument()
            comment.parent = page
            try:
                dateauthor = li.cssselect("div.commentsbox")[0].text_content()
            except IndexError:
                comment.props.author = li.text_content().split(":")[0]

                comment.props.date = readDate(":".join(li.text_content().split(":")[1:2]))
                try:
                    comment.props.text = li.cssselect("div.comment-text-reply")[0]
                except UnicodeDecodeError:
                    continue
            else:
                comment.props.author = dateauthor.split("Geplaatst door")[1].split(" op ")[0]
                try:
                    li.cssselect("div.commentsbox a")[0].drop_tree()
                except:
                    pass
                comment.props.date = readDate(dateauthor.split(" op ")[1])
                try:
                    comment.props.text = li.cssselect("div.comment-text")[0]
                except UnicodeDecodeError:
                    continue
            yield comment
Example #6
    def _scrape_unit(self, url):
        article = HTMLDocument(url=url)
        article.prepare(self)

        content = article.doc.cssselect("#content-column")[0]
        article.props.date = readDate(content.cssselect("p.article-meta")[0].text.split("|")[1])
        article.props.headline = content.cssselect("h1")[0].text
        
        for x in [
            content.cssselect("h1")[0],
            content.cssselect("p.article-meta")[0],
            content.cssselect("p.sharing")[0]
        ]:
            x.drop_tree()

        article.props.text = content.text_content()

        for block in article.doc.cssselect("#aside-column div.block"):
            title = block.cssselect("h2")[0].text
            if "Verantwoordelijk" in title and "ministerie" in title:
                article.props.author = "; ".join([a.text for a in block.cssselect("ul.list-common li a")])
                break
        
        try:
            if len(article.props.author) > 100:
                article.props.author = article.props.author[:100]
        except AttributeError:
            pass
        yield article
Example #7
 def _scrape_unit(self, url):
     page = HTMLDocument(url=url)
     page.prepare(self)
     for comment in self.get_comments(page):
         yield comment
         comment.is_comment = True
     yield self.get_article(page)
Example #10
 def _scrape_unit(self, pagenr_url):
     pagenumber, url = pagenr_url
     page = HTMLDocument(date=self.options.get("date"), url=url, pagenumber=pagenumber)
     page.prepare(self)
     article = self._get_article(page)
     if article:
         yield article
Example #12
    def scrape_media(self, doc, _type):
        scrn = HTMLDocument()
        scrn.doc = doc
        try:
            scrn.props.text = scrn.doc.cssselect("div.mediaDescription")[0]
        except IndexError:
            scrn.props.text = "none"

        try:
            scrn.props.headline = "{} {}".format(scrn.doc.cssselect("div.screenshotAppName")[0].text,_type)
        except IndexError:
            scrn.props.headline = "unknown"

        author_url = "/".join(scrn.doc.cssselect("div.linkAuthor a")[0].get('href').split("/")[:-2])
        scrn = self.get_author_props(scrn, author_url)

        for obj in scrn.doc.cssselect("div.rightDetailsBlock div.detailsStatRight"):
            try:
                scrn.props.date = readDate(obj.text)
            except ValueError:
                continue
            else:
                break

        if not scrn.doc.cssselect("div.commentthread_paging"):
            yield scrn
            return
        if not scrn.doc.cssselect("div.commentthread_header div.commentthread_paging span")[1].text_content():
            for comment in self.scrape_comments(scrn):
                yield comment
        else:
            raise NotImplementedError

        yield scrn
Example #13
 def get_comment(self, page, header,table):
     comment = HTMLDocument()
     comment.parent = page
     comment.props.date = readDate(header.cssselect("span.kmsgdate")[0].get('title'))
     comment.props.headline = header.cssselect("h2 span")[0].text_content()
     comment.props.author = table.cssselect("li.kpost-username")[0].text_content()
     comment.props.text = table.cssselect("div.kmsgtext")[0]
     return comment
Example #15
 def get_comments(self, page):
     for div in page.doc.cssselect("div.comment"):
         comment = HTMLDocument()
         comment.props.text = div.cssselect("div.reactie")[0]
         comment.props.author = div.cssselect("li.naam")[0].text_content()
         comment.props.date = readDate(div.cssselect("li.date")[0].text_content())
         comment.parent = page
         yield comment
Example #16
 def get_comments(self, page):
     for li in page.doc.cssselect("#detail_reactions #reaction ul.clear li"):
         comment = HTMLDocument()
         comment.props.author = li.cssselect("cite")[0].text.strip()
         comment.props.text = li.cssselect("blockquote")[0]
         comment.props.date = readDate(li.cssselect("span.time")[0].text)
         comment.parent = page
         yield comment
Example #18
 def _get_units(self):
     for url, doc in self.getdocs():
         date = readDate(doc.cssselect("section.headlinedivider p.lfloat")[0].text_content().strip().split("am")[1])
         print(date)
         if date.date() != self.options['date']:
             continue
         article = HTMLDocument(url = url, date = date)
         article.doc = doc
         yield article
Example #20
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self)
     article.props.date = self.options['date']
     article.props.section = " > ".join([
             a.text for a in article.doc.cssselect("#SubHeaderBreadcrumbs a")])
     article.props.headline = article.doc.cssselect("title")[0].text.split("—")[0].strip()
     article.props.text = article.doc.cssselect(
         "#parent-fieldname-description") + article.doc.cssselect("#parent-fieldname-text")
     yield article
Example #21
    def _scrape_unit(self, ipage):
        page = ipage
        ipage = HTMLDocument(ipage)
        ipage.doc = self.open(page['url'])

        text = wegenertools.clean(ipage.doc.read())
        err_text = "Uw account is niet geregistreerd voor de door u gekozen uitgave."
        if err_text in text:
            raise Exception(err_text)
        for article_ids in wegenertools.get_article_ids(text):
            body, headline, byline = wegenertools.get_article(
                text, article_ids)
            if len(
                    body
            ) >= 300:  #filtering non-articles, image links and other html crap
                artpage = HTMLDocument()
                stop = False
                for part in body.split("\n\n"):
                    if part.isupper():
                        pass
                    else:
                        if "\n" in part:
                            #when title has a linebreak it's probably not an article
                            stop = True
                            break
                        else:

                            artpage.props.headline = part
                            break
                if stop:
                    break
                else:

                    p = re.compile("[\\\]udc[\w\w]")
                    artpage.props.text = literal_eval(p.sub("", repr(body)))
                    artpage.props.edition = page['edition']
                    artpage.props.byline = byline
                    artpage.props.section = page['section']
                    if re.match("[A-Z][0-9]+", page['page_str']):
                        artpage.props.section += " - section " + page[
                            'page_str'][0]
                        artpage.props.pagenr = int(page['page_str'][1:])
                    else:
                        artpage.props.pagenr = int(page['page_str'])

                    dateline_pattern = re.compile(
                        "(^[^\n]+\n\n([A-Z]+( [A-Z]+)?) -\n)|(([A-Z]+( [A-Z]+)?)\n\n)"
                    )
                    match = dateline_pattern.search(artpage.props.text)
                    if match:
                        #dateline and theme have the same syntax and are therefore undistinguishable
                        artpage.props.dateline_or_theme = match.group(
                            2) or match.group(5)
                    artpage.props.url = page['url']
                    yield artpage
Example #22
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self.scraper)
     article.props.headline = article.doc.cssselect("#articleDetailTitle")[0].text_content()
     time_post = article.doc.cssselect("div.time_post")[0]
     if time_post.cssselect("span.author"):
         article.props.author = time_post.cssselect("span.author")[0].text_content().lstrip("Dor:")
         time_post.cssselect("span.author")[0].drop_tree()
     article.props.date = readDate(time_post.text_content())
     article.props.text = article.doc.cssselect("#art_box2")[0]
     yield article
Example #23
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self.scraper)
     article.props.date = readDate(article.doc.cssselect("div.dateplace-data")[0].text)
     article.props.headline = article.doc.cssselect("h1")[0].text_content().strip()
     [s.drop_tree() for s in article.doc.cssselect("script")]
     article.props.text = article.doc.cssselect("#leadarticle div.content")[0]
     author = article.doc.cssselect("#leadarticle span.smallprint")
     if author:
         article.props.author = author[0].text.strip("| ")
     yield article
Example #24
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self.scraper)
     article.props.date = readDate(article.doc.cssselect("#artikel span.datum,#artikel span.datum-premium-content")[0].text_content())
     article.props.headline = article.doc.cssselect("#artikel h1")[0].text_content()
     author = article.doc.cssselect("#artikel span.auteur")
     if author:
         article.props.author = author[0].text_content()
     [s.drop_tree() for s in article.doc.cssselect("#artikelKolom script")]
     article.props.text = article.doc.cssselect("#artikelKolom,#artikel div.zak-txt-premium-content")[0]
     yield article
Example #25
    def _scrape_unit(self, url):
        doc = self.getdoc(url)
        for a in doc.cssselect('#Articles a'):
            page = HTMLDocument(date=self.options.get('date'))
            page.coords = stoolkit.parse_coords(doc.cssselect('div.%s' % a.get('class')))
            page.props.url = urljoin(url, '%s_text.html' % a.get('class'))

            page.prepare(self)
            article = self.get_article(page)
            if article:
                yield article
Example #26
 def get_article(self, url, datetime):
     page = HTMLDocument(url = url)
     page.prepare(self)
     page.props.headline = page.doc.cssselect("div.article h1")[0]
     page.props.text = page.doc.cssselect("#broodtekst")[0]
     page.props.date = datetime
     if page.doc.cssselect("div.auteursinfo"):
         page.props.author = page.doc.cssselect("div.auteursinfo h2")[0].text_content()
     page.props.section = url.split("/")[3]
     page.props.html = html.tostring(page.doc)
     return page
Example #27
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self)
     article.props.date = readDate(article.doc.cssselect("#artikel span.datum")[0].text_content())
     article.props.headline = article.doc.cssselect("#artikel h1")[0].text_content()
     author = article.doc.cssselect("#artikel span.auteur")
     if author:
         article.props.author = author[0].text_content()
     [s.drop_tree() for s in article.doc.cssselect("#artikelKolom script")]
     article.props.text = article.doc.cssselect("#artikelKolom")[0]
     yield article
Example #28
    def _scrape_unit(self, data):
        headline, article_date, pagenr, section, url = data
        art = HTMLDocument(
            headline = headline, date = article_date, 
            pagenr = pagenr, section = section, url = url)
        art.doc = self.open(url).read()

        text = self.pdf_to_text(art.doc).decode('utf-8')
        art.props.text = self.fix_text(text)
        art.props.source = "dekrantvantoen.nl"
        yield art
Example #29
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self)
     article.props.headline = article.doc.cssselect("#articleDetailTitle")[0].text_content()
     time_post = article.doc.cssselect("div.time_post")[0]
     if time_post.cssselect("span.author"):
         article.props.author = time_post.cssselect("span.author")[0].text_content().lstrip("Dor:")
         time_post.cssselect("span.author")[0].drop_tree()
     article.props.date = readDate(time_post.text_content())
     article.props.text = article.doc.cssselect("#art_box2")[0]
     yield article
Example #30
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self)
     article.props.date = readDate(article.doc.cssselect("div.dateplace-data")[0].text)
     article.props.headline = article.doc.cssselect("h1")[0].text_content().strip()
     [s.drop_tree() for s in article.doc.cssselect("script")]
     article.props.text = article.doc.cssselect("#leadarticle div.content")[0]
     author = article.doc.cssselect("#leadarticle span.smallprint")
     if author:
         article.props.author = author[0].text.strip("| ")
     yield article
Example #31
 def _scrape_unit(self, url):
     page = HTMLDocument(url = url)
     page.prepare(self)
     page.props.headline = page.doc.cssselect("#art_box2 h1")[0].text_content()
     for h1 in page.doc.cssselect("h1"):
         h1.drop_tree()
     page.props.author = self.getauthor(page.doc)
     page.props.text = page.doc.cssselect("#art_box2 p")
     page.props.date = readDate(page.doc.cssselect("div.time_post")[0].text.split("Bron:")[0])
     page.props.section = re.search("parool/nl/[0-9]+/([\w\d\-]+)/article", page.props.url).group(1).capitalize()
     page.props.html = html.tostring(page.doc)
     yield page
Example #32
 def get_article(self, url):
     url = "{}_body.html".format(url[:-5])
     pagenum = url.split("/")[7][0:5]
     article = HTMLDocument(url = url, pagenr = int(pagenum))
     article.doc = self.getdoc(url)
     article.props.headline = article.doc.cssselect("td.artheader")[0].text_content().strip()
     article.props.text = article.doc.cssselect("table.body")[0]
     if article.doc.cssselect("td.artauthor"):
         article.props.author = article.doc.cssselect("td.artauthor")[0].text_content().lstrip("dor")
     article.props.date = self.options['date']
     article.props.section = self.section
     return article
Example #33
    def get_article(self, url, datetime):
        page = HTMLDocument(url=url)
        page.prepare(self)
        page.props.headline = page.doc.cssselect("div.article h1")[0]
        page.props.text = page.doc.cssselect("#broodtekst")[0]
        page.props.date = datetime
        if page.doc.cssselect("div.auteursinfo"):
            page.props.author = page.doc.cssselect(
                "div.auteursinfo h2")[0].text_content()
        page.props.section = url.split("/")[3]

        return page
Example #34
 def _scrape_unit(self, tr):
     """gets articles from a page"""
     url = urljoin("http://forum.fok.nl",tr.cssselect("td.tTitel a")[0].get('href'))
     topic = HTMLDocument(url=url,
                          section=tr.cssselect("td.tFolder")[0].text_content())
     
     topic.prepare(self)
     content = topic.doc.text_content()
     if any([(s in content) for s in SEARCHTERMS]):
         for comment in self.get_comments(topic):
             comment.is_comment = True
             yield comment
         yield self.get_article(topic)
Example #36
 def _scrape_unit(self, url):
     article = HTMLDocument(url = url)
     article.prepare(self)
     firstitem = article.doc.cssselect("#content div.left_div div.item")[0]
     article.props.text = firstitem.cssselect("p, div")
     article.props.date = readDate(firstitem.cssselect("span.date")[0].text)
     article.props.section = "nieuws"
     article.props.headline = firstitem.cssselect("h3")[0].text
     article.props.externalid = url.split("/")[-1]
     em = firstitem.cssselect("em")
     if em:
         article.props.author = "".join(em[0].text.split("Door ")[1:])
     yield article
Example #37
 def get_comments(self, article):
     for li in article.doc.cssselect("li.comment"):
         comment = HTMLDocument()
         comment.props.text = li.cssselect("div.comment-text")[0]
         
         pattern = re.compile("Geplaatst door ([\w ]+) op ([\w :]+)")
         result = pattern.search(
             li.cssselect("div.commentsbox span")[0].text_content()
             )
         comment.props.author = result.group(1)
         comment.props.date = readDate(result.group(2))
         comment.parent = article
         yield comment
Example #38
 def _scrape_unit(self, urldoc):
     article = HTMLDocument(url = urldoc[0])
     article.doc = urldoc[1]
     _date = [
         int(urldoc[0].split("/")[6]),
         int(urldoc[0].split("/")[7].split("_")[0]),
         int(urldoc[0].split("/")[8])]
     article.props.date = date(*_date)
     article.props.section = urldoc[0].split("/")[9]
     article.props.author = article.doc.cssselect("div.fullarticle_tagline")[0].text.split("|")[0]
     article.props.headline = article.doc.cssselect("h1.title")[0].text
     article.props.text = article.doc.cssselect("article")[0]
     yield article
Example #39
    def _scrape_unit(self, url):
        doc = self.getdoc(url)

        for li in doc.cssselect("div#article ul.news-list li"):
            url = li.cssselect("a")[0].get('href')
            url = urljoin(INDEX_URL, url)
            page = HTMLDocument(date=self.options['date'], url=url)
            page.prepare(self)
            page.doc = self.getdoc(page.props.url)

            try:
                yield self.get_article(page)
            except IndexError:
                pass
Example #40
 def scrape_comments(self,page):
     p = page.props.url+"?page={}"
     if not page.doc.cssselect("ul.pager"):
         return
     total = int(page.doc.cssselect("ul.pager li.pager-last a")[0].get('href').split("page=")[-1].split("&")[0]) + 1
     docs = [self.getdoc(p.format(x)) for x in range(total)]
     for doc in docs:
         for div in doc.cssselect("#comments div.comment"):
             comment = HTMLDocument()
             comment.props.text = div.cssselect("div.content")[0]
             comment.props.author = div.cssselect("span.submitted-username")[0].text_content()
             comment.props.date = readDate(div.cssselect("div.submitted div.floatr")[0].text_content())
             comment.parent = page
             yield comment
Example #41
 def _scrape_unit(self, bits):
     date, url = bits
     article = HTMLDocument(date = date, url = url)
     article.prepare(self)
     content = article.doc.cssselect("#content")[0]
     article.props.section = content.cssselect("div.info-block p.meta a.label")[0].text
     article.props.headline = content.cssselect("div.title h1")[0].text
     article.props.externalid = url.split("-")[-1].strip("W/")
     article.props.text = content.cssselect("div.article")
     article.props.author = content.cssselect("p.meta span.user a.label")[0].text.strip()
     article.props.tags = set([a.text for a in content.cssselect("ul.taglist li a")])
     article.props.view_count = int(content.cssselect("div.info-block span.view-count")[0].text)
     yield article
     self.clearcookies()
Example #43
    def _scrape_unit(self, props):
        props["date"] = self.options["date"]
        if props["type"] == "short":
            yield Document(**props)
        elif props["type"] == "full":
            article = HTMLDocument(**props)
            article.prepare(self)

            article.props.section = " > ".join(
                [li.text_content().strip("|") for li in article.doc.cssselect("#a-breadcrumb-component li.a-nav-item")]
            )
            article.props.headline = article.doc.cssselect("div.column_main_wrapper h1.blue")[0].text
            article.props.text = article.doc.cssselect("div.column_main_wrapper div")[0]
            yield article
Example #45
 def get_comments(self, topic):
     first = True
     for page in self.get_pages(topic.doc):
         if first == True:
             comments = page.cssselect("div.post")[1:]
             first = False
         else:
             comments = page.cssselect("div.post")
         for div in comments:
             comment = HTMLDocument()
             comment.parent = topic
             comment.props.author = div.cssselect("div.postholder_top a.username")[0]
             comment.props.date = readDate(div.cssselect("div.postholder_top span.post_time")[0].text_content())
             comment.props.text = div.cssselect("div.postholder_bot div.contents")[0]
             yield comment
Example #46
 def _scrape_unit(self, xmlitem):
     html_content = html.fromstring(xmlitem.cssselect("description")[0].text)
     url = xmlitem.cssselect("link")[0].tail.split("&url=")[-1]
     article = HTMLDocument(url = url)
     article.props.headline = " - ".join(xmlitem.cssselect("title")[0].text.split(" - ")[:-1])
     article.props.source = xmlitem.cssselect("title")[0].text.split(" - ")[-1]
     article.props.section = xmlitem.cssselect("category")[0].text
     article.props.date = readDate(xmlitem.cssselect("pubdate")[0].text)
     article.props.snippet = html_content.cssselect("div.lh font")[1].text
     try:
         article.prepare(self)
     except Exception:
         yield article
         return
     article.props.html = html.tostring(article.doc)
     yield article
Example #47
 def _scrape_unit(self, urldoc):
     url, doc = urldoc
     article = HTMLDocument(url = url)
     article.doc = doc
     article.props.images = [self.open(img.get('src')).read() for img in article.doc.cssselect("div.broodMediaBox div.image img")]
     article.props.headline = "Poll:" + doc.cssselect("#artikel h1")[0].text_content().split(":")[1]
     article.props.byline = doc.cssselect("#artikel span.auteur")[0].text
     article.props.date = readDate(doc.cssselect("#artikel span.datum")[0].text)
     article.props.externalid = article.props.url.split("/")[-2]
     article.props.text = doc.cssselect("#artikelKolom div.zaktxt,p")
     article.props.dateline = doc.cssselect("#artikelKolom span.location")[0]
     for comment in self.get_comments(article):
         comment.is_comment = True
         yield comment
     yield article
Example #49
    def scrape_file(self, _html, t):
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')
            ]

        for div in divs:
            article = HTMLDocument()
            article.props.html = div
            article.props.headline = div.cssselect(
                "#articleTitle")[0].text_content()
            article.props.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.props.pagenr, article.props.section = self.get_pagenum(
                    articlepage[0].text)

            if not div.cssselect("#sourceTitle")[0].text:
                article.props.medium = Medium.get_or_create("unknown medium")
            else:
                article.props.medium = Medium.get_or_create(
                    div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.props.date = readDate(date_str)
            except ValueError:
                log.error(
                    "parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Example #50
    def _get_units(self):
        for x in range(3):
            try:
                self._cookie()
            except:
                print(
                    'Error 503 at _cookie function, trying again in a minute...'
                )
                time.sleep(60)
            else:
                break

        index_dict = {
            'y': self.options['date'].year,
            'm': self.options['date'].month,
            'd': self.options['date'].day
        }

        url = INDEX_URL.format(**index_dict)

        for x in range(3):
            try:
                index = self.getdoc(url)
            except Exception:
                time.sleep(5)
            else:
                break

        articles = index.cssselect('.title')
        for article_unit in articles:
            href = article_unit.cssselect('a')[0].get('href')
            yield HTMLDocument(url=href, date=self.options['date'])
Example #51
    def _get_units(self):
        """
        PhpBB forum scraper
        """
        index = self.getdoc(self.index_url)

        for cat_title, cat_doc in self.get_categories(index):
            for page in self.get_pages(cat_doc):
                for fbg in page.cssselect('.forumbg'):
                    for li in fbg.cssselect('.topics > li'):
                        url = urljoin(
                            self.index_url,
                            li.cssselect("a.topictitle")[0].get('href'))
                        _date = etree.tostring(
                            li.cssselect("dd.lastpost")[0]).split("br />")[1]
                        date = toolkit.readDate(_date)
                        yield {
                            'date':
                            date,
                            'object':
                            HTMLDocument(
                                headline=li.cssselect("a.topictitle")[0].text,
                                url=url,
                                category=cat_title)
                        }
Example #52
 def _scrape_unit(self, url):
     """gets articles from a page"""
     _json = self.open(str(url)).read()
     data = json.loads(_json)
     done = False
     while data['has_more_items'] and done == False:
         doc = html.fromstring(data['items_html'])
         for div in doc.cssselect("div.tweet"):
             tweet = HTMLDocument()
             tweet.props.author = div.cssselect(
                 "strong.fullname")[0].text_content()
             tweet.props.date = datetime.fromtimestamp(
                 float(
                     div.cssselect("a.tweet-timestamp ._timestamp")[0].get(
                         'data-time')))
             tweet.props.text = div.cssselect("p.js-tweet-text")[0]
             maxid = div.get('data-tweet-id')
             if tweet.props.date.date() < self.options['date']:
                 done = True
                 break
             elif tweet.props.date.date() == self.options['date']:
                 yield tweet
         if done == False:
             nexturl = url + "&max_id={}".format(maxid)
             data = json.loads(self.open(str(nexturl)).read())
Example #53
    def _get_units(self):
        back = (date.today() - self.options['date']).days
        index_text = self.open(
            self.index_url.format(**locals())).read().decode('utf-8')
        #A character in the header makes the html library fail to parse the page correctly (it silently returns half the page without warning -.-)
        #The character is located in the class attribute of each article tag that is to be scraped, so we take the article tag's inner wrapper and parse that instead.
        article = 0
        arts = []
        for part in index_text.split("<article"):
            article = part.split("</article>")[0]
            arts.append(article)

        for art in set(arts):
            item = html.fromstring(art)
            try:
                _time = time(
                    *map(int,
                         item.cssselect("div.time")[0].text.split(":")))
            except IndexError:
                continue
            article = HTMLDocument(
                date=datetime.combine(self.options['date'], _time),
                headline=item.cssselect("h2.title")[0].text,
                url=urljoin(
                    self.index_url.format(**locals()),
                    item.cssselect("h2.title")[0].getparent().get('href')),
            )
            yield article
Example #54
    def _get_units(self):
        self.open("http://www.powned.tv")
        self.open("http://cookies.publiekeomroep.nl/accept/")
        d = self.options['date']
        docs = []
        for x in range(d.day - 7, d.day + 7):
            archive_url = ARCHIVE_URL.format(**locals())
            try:
                doc = self.getdoc(archive_url)
            except HTTPError:
                pass
            else:
                docs.append(doc)

        entries = set([])
        for doc in docs:
            for li in doc.cssselect("ul.articlelist li"):

                _date = readDate(
                    " ".join(li.cssselect("span.t")[0].text.split()[:2]) +
                    " " + str(self.options['date'].year)).date()
                url = urljoin(archive_url, li.cssselect("a")[0].get('href'))
                entries.add((_date, url))

        for _date, url in entries:

            if _date == self.options['date']:
                article = HTMLDocument(date=_date, url=url)
                yield article
Example #55
 def extract_articles(self, url):
     try:
         doc = self.getdoc(url)
     except HTTPError:
         return
     for tag in doc.cssselect("#main article.news"):
         if 'poll' in tag.get('class'):
             continue
         _date = datetime.fromtimestamp(int(tag.get('created')))
         article = HTMLDocument(date = _date)
         if tag.cssselect("div.tweet"):
             article.props.type = "tweet"
             article.props.text = tag.cssselect("p")[0]
             article.props.author = article.props.text.cssselect("b a")[0].get('title')
             article.props.url = url.split("?")[0]
         elif tag.cssselect("div.quoteBody"):
             article.props.type = "quote"
             a = tag.cssselect("div.quoteBody a")[0]
             article.props.text = a.text_content()
             article.props.url = urljoin(url, a.get('href'))
             article.props.author = tag.cssselect("span.author")[0].text.strip()
         elif tag.cssselect("div.videoContainer") or 'promo' in tag.get('class'):
             continue
         elif tag.cssselect("div.tagline h4"):
             self.stories.add(urljoin(url, tag.cssselect("h4 a")[0].get('href')))
             continue
         else:
             h = tag.cssselect("div.body h3")[0]
             article.props.type = "article"
             article.props.headline = h.text_content().strip()
             if h.cssselect("a"):
                 article.props.url = urljoin(url, h.cssselect("a")[0].get('href'))
             else:
                 article.props.url = url
         yield article
Example #56
    def _scrape_unit(self, url):
        article = HTMLDocument(url = url)
        article.prepare(self)

        article.props.text = article.doc.cssselect("#article p:not(#article-info):not(#metadata)")
        info = article.doc.cssselect("#article-info a")
        article.props.date = readDate(info[0].text)
        article.props.section = info[1].text
        article.props.page_str = info[2].text
        article.props.headline = article.doc.cssselect("#article h1")[0].text
        if article.doc.cssselect("#metadata"):
            metadata = article.doc.cssselect("#metadata")[0].text_content().split("|")
            for m in metadata:
                if m.strip().startswith("Trefwoord"):
                    article.props.tags = [t.strip() for t in m.strip().split(";")]
        yield article