Esempio n. 1
0
    def getarticle(self, headline, lines):
        """Build an Article from a headline and the raw lines that follow it.

        ``lines[1]`` is searched for a ``dd-mm-yyyy`` date and for an optional
        ``(p.N)`` page reference; ``lines[2:]`` provide the body text.

        Raises AttributeError when ``lines[1]`` contains no date, because the
        failed regex match is dereferenced unconditionally.
        """
        article = Article(headline=headline)

        # Lines of at most two characters are dropped; every kept line is
        # prefixed with a newline so the replacements below can see line ends.
        text = "".join("\n" + line for line in lines[2:] if len(line) > 2)
        text = text.replace("-\n", "")  # glue words hyphenated across lines
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")
        article.text = text

        # Raw strings: "\-" and "\(" are not valid string escapes and trigger
        # warnings on recent Python versions when written in a plain literal.
        date_match = re.search(
            r"([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})", lines[1])
        article.date = date(int(date_match.group(3)),
                            int(date_match.group(2)),
                            int(date_match.group(1)))

        # NOTE(review): the "." after "p" is unescaped and therefore matches
        # any character; kept as-is to preserve which inputs are accepted.
        page_match = re.search(r"\(p.([0-9]+)([0-9\-]+)?\)", lines[1])
        if page_match:
            article.pagenr = int(page_match.group(1))

        # The headline is compared as a substring of each index headline; when
        # several index entries match, the last one determines the medium.
        needle = article.headline.lower().strip()
        for h, medium in self.index:
            if needle in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))

        return article
Esempio n. 2
0
    def getarticle(self, headline, lines):
        """Create an Article for ``headline`` out of its raw text lines.

        The second line (``lines[1]``) must hold a ``dd-mm-yyyy`` date and may
        hold a ``(p.N)`` page reference; the remaining lines form the body.

        Raises AttributeError when no date is present in ``lines[1]``, since
        the regex match object is used without a None check.
        """
        article = Article(headline=headline)

        # Keep only lines longer than two characters, each preceded by a
        # newline, then normalise: undo end-of-line hyphenation, collapse
        # double spaces and flatten the remaining newlines to spaces.
        text = "".join("\n" + line for line in lines[2:] if len(line) > 2)
        text = text.replace("-\n", "")
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")
        article.text = text

        # Patterns are raw strings so that "\-" and "\(" reach the regex
        # engine without relying on invalid string escape sequences.
        date_pattern = re.compile(r"([0-9]{2,2})\-([0-9]{2,2})\-([0-9]{4,4})")
        found = date_pattern.search(lines[1])
        day, month, year = (int(found.group(i)) for i in (1, 2, 3))
        article.date = date(year, month, day)

        # NOTE(review): "p." uses an unescaped dot (any character); left
        # unchanged so the set of accepted inputs stays the same.
        pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
        found = pagenum_pattern.search(lines[1])
        if found:
            article.pagenr = int(found.group(1))

        # Substring match of the headline against every index headline; the
        # last matching entry wins.
        wanted = article.headline.lower().strip()
        for h, medium in self.index:
            if wanted in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))

        return article
 def parse_item(self, item):
     """Turn a list of html tags into an Article.

     ``p`` and ``div`` tags make up the body, an ``h2`` tag supplies the
     headline and an ``i`` tag is handed to ``self.parse_dateline``.
     """
     article = Article(metastring = {})
     # Body tags are gathered in a local list. The previous in-place
     # bookkeeping on article.text used an inverted condition, so every new
     # tag replaced the earlier ones instead of being appended.
     body_tags = []
     for tag in item:
         if tag.tag in ("p", "div"):
             body_tags.append(tag)
         elif tag.tag == "h2":
             article.headline = tag.text
         elif tag.tag == "i":
             article = self.parse_dateline(tag.text, article)
     # Serialise each body tag and convert it to plain text, one per line.
     article.text = "\n".join(
         [html2text(html.tostring(bit)) for bit in body_tags])
     return article
Esempio n. 4
0
    def scrape_1(self, _html, t):
        """Yield articles from the format used mostly in 2013.

        ``t`` selects where the article divs live: "werkmap" reads them from
        ``#articleTable``, "intranet/rss" from the ``sort_*`` divs below
        ``#sort``.  Articles whose date cannot be parsed are logged and
        skipped.

        Raises ValueError (on first iteration, since this is a generator) when
        ``t`` contains neither marker.
        """
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            # A div without an id yields None from get(); fall back to "" so
            # the substring test cannot raise TypeError.
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in (div.get('id') or "")
            ]
        else:
            # Previously this case fell through and failed with an unbound
            # local variable further down.
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = Article(metastring={})
            article.metastring['html'] = div
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            # NOTE(review): the intro element itself (not its text) is stored.
            article.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.pagenr, article.section = self.get_pagenum(
                    articlepage[0].text)

            article.medium = self.get_medium(
                div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error(
                    "parsing date \"{date_str}\" failed".format(
                        date_str=date_str))
            else:
                yield article
Esempio n. 5
0
    def scrape_1(self, _html, t):
        """Yield articles from the format used mostly in 2013.

        ``t`` decides which container is read: "werkmap" uses the divs below
        ``#articleTable``, "intranet/rss" the ``sort_*`` divs below ``#sort``.
        Articles whose date string cannot be parsed are logged and skipped.

        Raises ValueError (on first iteration) when ``t`` names neither source.
        """
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            # get('id') is None for divs without an id attribute; substitute
            # "" so the membership test does not raise TypeError.
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in (div.get('id') or "")
            ]
        else:
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = Article(metastring=div.text_content())
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            article.text = div.cssselect("#articleIntro")[0].text_content()
            articlepage = div.cssselect("#articlePage")

            if articlepage:
                article.pagenr, article.section = self.get_pagenum(
                    articlepage[0].text_content())

            article.medium = self.get_medium(
                div.cssselect("#sourceTitle")[0].text_content())
            date_str = div.cssselect("#articleDate")[0].text_content()

            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error("parsing date \"{date_str}\" failed".format(
                    date_str=date_str))
            else:
                yield article
Esempio n. 6
0
    def parse_item(self, item):
        """Turn a list of html tags into an Article.

        ``p`` and ``div`` tags form the body, ``h2`` supplies the headline and
        the text content of an ``i`` tag is passed to ``self.parse_dateline``.
        """
        article = Article(metastring={})
        # Collect body tags locally. The earlier check
        # ``not (hasattr(article, 'text') or article.text)`` was inverted, so
        # each new tag overwrote the list and only the last one survived.
        body_tags = []
        for tag in item:
            if tag.tag in ("p", "div"):
                body_tags.append(tag)
            elif tag.tag == "h2":
                article.headline = tag.text
            elif tag.tag == "i":
                article = self.parse_dateline(tag.text_content(), article)
        # Serialise every body tag and convert it to text, one tag per line.
        article.text = "\n".join(
            [html2text(html.tostring(bit)) for bit in body_tags])

        return article
Esempio n. 7
0
    def parse_item(self, item):
        """Build an Article from ``item``, a list of html tags.

        The body comes from ``self._parse_text``; an ``h2`` tag sets the
        headline and an ``i`` tag is forwarded to ``self.parse_dateline``.
        """
        article = Article(metastring={})
        article.text = self._parse_text(item)

        for element in item:
            if element.tag == "h2":
                article.headline = element.text
                continue
            if element.tag == "i":
                # parse_dateline returns the article to continue with
                dateline = element.text_content()
                article = self.parse_dateline(dateline, article)

        return article
Esempio n. 8
0
 def parse_item(self, item):
     """Build an Article from ``item``, a list of html tags.

     Body text comes from ``self._parse_text``.  An ``h2`` provides the
     headline (falling back to its first ``span`` when the tag has no direct
     text); an ``i`` tag, or a ``p`` containing an ``i``, is treated as the
     dateline.  Raises Exception when no headline was found.
     """
     article = Article(metastring={})
     article.text = self._parse_text(item)

     for element in item:
         if element.tag == "h2":
             if not element.text:
                 first_span = element.cssselect("span")[0]
                 article.headline = first_span.text_content()
             else:
                 article.headline = element.text
             continue

         if element.tag == "i" or (element.tag == "p"
                                   and element.cssselect("i")):
             article = self.parse_dateline(element.text_content(), article)

     if article.headline:
         return article
     raise Exception("Article has no headline")
Esempio n. 9
0
 def parse_item(self, item):
     """Convert ``item`` (a list of html tags) into an Article.

     ``self._parse_text`` supplies the body.  The headline is taken from an
     ``h2`` tag, using its first ``span`` when the tag itself carries no
     text.  A dateline is an ``i`` tag or a ``p`` tag that contains one.
     An Exception is raised if the article ends up without a headline.
     """
     article = Article(metastring={})
     article.text = self._parse_text(item)

     for node in item:
         if node.tag == "h2":
             article.headline = (
                 node.text if node.text
                 else node.cssselect("span")[0].text_content())
         else:
             is_dateline = node.tag == "i" or (
                 node.tag == "p" and node.cssselect("i"))
             if is_dateline:
                 article = self.parse_dateline(node.text_content(), article)

     if not article.headline:
         raise Exception("Article has no headline")
     return article
Esempio n. 10
0
 def parse_item(self, item):
     """Build an Article from ``item``, a list of html tags.

     Body text comes from ``self._parse_text``.  Only the first ``h2`` is
     used as headline (its first ``span`` when the tag has no direct text)
     and only the first dateline -- an ``i`` tag or a ``p`` containing an
     ``i`` -- is passed to ``self.parse_dateline``.

     Raises Exception when the article has no headline.
     """
     article = Article()
     article.text = self._parse_text(item)
     headline_found = False
     dateline_found = False
     for tag in item:
         if tag.tag == "h2" and not headline_found:
             if tag.text:
                 article.headline = tag.text
             else:
                 article.headline = tag.cssselect("span")[0].text_content()
             headline_found = True
         # The whole "is this a dateline" test is parenthesised: "and" binds
         # tighter than "or", so without it the dateline_found guard only
         # covered the <p> case and every bare <i> re-parsed the dateline.
         elif (not dateline_found
               and (tag.tag == "i"
                    or (tag.tag == "p" and tag.cssselect("i")))):
             article = self.parse_dateline(tag.text_content(), article)
             dateline_found = True
     if not article.headline:
         raise Exception("Article has no headline")
     return article
 def scrape_3(self, _html):
     """Yield articles from the MS Word style export (as of 2014-03-03).

     The direct children of ``body > div`` are split into groups at every
     element that contains an ``hr``.  The first group is skipped, and tags
     after the last ``hr`` are never emitted.
     """
     groups = []
     current = []
     for element in _html.cssselect("body > div > *"):
         if not element.cssselect("hr"):
             current.append(element)
             continue
         # separator reached: close the running group
         groups.append(current)
         current = []

     for group in groups[1:]:
         article = Article()
         # NOTE(review): dateline and headline are both read from group[1];
         # one of them may have been meant to use another index -- confirm.
         article = self.parse_dateline(
             group[1].text_content().strip(), article)
         article.headline = group[1].text_content().strip()
         html_str = "".join(html.tostring(node) for node in group[2:])
         article.text = html2text(html_str)
         article.metastring = {'html': html_str}

         yield article
Esempio n. 12
0
    def scrape_3(self, _html):
        """Yield one Article per section of the MS Word format (2014-03-03).

        Children of ``body > div`` are partitioned on elements containing an
        ``hr``; the partition before the first separator is ignored and
        whatever follows the last separator is not emitted.
        """
        sections = [[]]
        for node in _html.cssselect("body > div > *"):
            if node.cssselect("hr"):
                # an <hr> closes the running section and opens a fresh one
                sections.append([])
            else:
                sections[-1].append(node)
        # the still-open last section was never closed by an <hr>: drop it
        closed_sections = sections[:-1]

        for nodes in closed_sections[1:]:
            article = Article()
            # NOTE(review): nodes[1] feeds both the dateline parser and the
            # headline; confirm whether different indexes were intended.
            dateline = nodes[1].text_content().strip()
            article = self.parse_dateline(dateline, article)
            article.headline = nodes[1].text_content().strip()

            serialised = [html.tostring(n) for n in nodes[2:]]
            html_str = "".join(serialised)
            article.text = html2text(html_str)
            article.metastring = {'html': html_str}

            yield article
Esempio n. 13
0
    def _scrape_unit(self, _file):
        """Parse one mail file into a single Article and yield it.

        Lines up to and including the first one starting with "1red" are kept
        as mail header (stored in the metastring); everything after it is
        content.  The first non-empty content line carries comma-separated
        metadata (date, section, medium), the first all-uppercase line is the
        title, and the remaining lines are grouped into paragraphs separated
        by empty lines.

        Raises IndexError when the file has no "Date:" line, no content marker
        or no uppercase title line.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = read_date(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  # actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  # loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():
                article.title = line
                break
            elif line:  # first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    # two-digit year written as '14 -> 2014
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = read_date(datestr)
                    if (article.date - file_date).days > 200:
                        # likely a misparse, with the mail being sent the
                        # next year.  timedelta() has no "years" argument, so
                        # the year is shifted with replace() instead.
                        previous_year = article.date.year - 1
                        try:
                            article.date = article.date.replace(
                                year=previous_year)
                        except ValueError:
                            # 29 February does not exist in the target year
                            article.date = article.date.replace(
                                year=previous_year, day=28)
                else:
                    article.date = read_date(datestr)
                # Look up the alias with the same key that was tested; the
                # old code tested data[2] but indexed with data[1].
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[2]]
                else:
                    medium_str = data[2]
                article.set_property("medium", medium_str)
                article.set_property("section", data[1])

        paragraphs = []
        paragraph = ""
        while True:
            # a trailing "=" marks a soft line break and is dropped
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  # subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        kept = []
        for p in paragraphs:
            kept.append(p + "\n\n")
            if p.startswith("(") and "," in p:  # last line of normal content
                break
        article.text = "".join(kept)

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            try:
                value = int(code, 16)
            except ValueError:
                # the pattern also matches non-hex pairs such as "=ZZ";
                # leave those untouched instead of failing
                return match.group()
            return bytearray([value]).decode('latin-1')

        article.text = re.sub("=[A-Z0-9]{2}", character, article.text)

        yield article
Esempio n. 14
0
    def _scrape_unit(self, _file):
        """Parse one mail file into a single Article and yield it.

        The mail header runs up to and including the first line that starts
        with "1red" and is stored in the metastring.  Of the content that
        follows, the first non-empty line holds comma-separated metadata
        (date, section, medium), the first all-uppercase line is the title and
        the rest is split into paragraphs on empty lines.

        Raises IndexError when the file lacks a "Date:" line, the content
        marker or an uppercase title line.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = read_date(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  # actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  # loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():
                article.title = line
                break
            elif line:  # first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    # expand an abbreviated year: '14 -> 2014
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = read_date(datestr)
                    if (article.date - file_date).days > 200:
                        # likely a misparse, with the mail being sent the
                        # next year.  Move the date back one calendar year;
                        # timedelta() does not accept a "years" argument.
                        previous_year = article.date.year - 1
                        try:
                            article.date = article.date.replace(
                                year=previous_year)
                        except ValueError:
                            # 29 February has no counterpart in that year
                            article.date = article.date.replace(
                                year=previous_year, day=28)
                else:
                    article.date = read_date(datestr)
                # Test and look up the alias with the same key (data[2]);
                # indexing with data[1] did not match the membership test.
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[2]]
                else:
                    medium_str = data[2]
                article.set_property("medium", medium_str)
                article.set_property("section", data[1])

        paragraphs = []
        paragraph = ""
        while True:
            # strip the trailing "=" that marks a soft line break
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  # subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        kept = []
        for p in paragraphs:
            kept.append(p + "\n\n")
            if p.startswith("(") and "," in p:  # last line of normal content
                break
        article.text = "".join(kept)

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            try:
                value = int(code, 16)
            except ValueError:
                # "=[A-Z0-9]{2}" also matches pairs that are not hexadecimal;
                # keep them verbatim rather than raising
                return match.group()
            return bytearray([value]).decode('latin-1')

        article.text = re.sub(
            "=[A-Z0-9]{2}",
            character,
            article.text)

        yield article
    def _scrape_unit(self, _file):
        """Parse one mail file into a single Article and yield it.

        Lines up to and including the first one starting with "1red" form the
        mail header (kept in the metastring).  In the content after it, the
        first non-empty line carries comma-separated metadata (date, section,
        medium), the first all-uppercase line is the headline and the
        remaining lines are grouped into paragraphs on empty lines.

        Raises IndexError when the file has no "Date:" line, no content marker
        or no uppercase headline line.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = readDate(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  # actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  # loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():  # headline
                article.headline = line
                break
            elif line:  # first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    # abbreviated year: '14 -> 2014
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = readDate(datestr)
                    if (article.date - file_date).days > 200:
                        # likely a misparse, with the mail being sent the
                        # next year.  timedelta() has no "years" argument, so
                        # shift the calendar year with replace() instead.
                        previous_year = article.date.year - 1
                        try:
                            article.date = article.date.replace(
                                year=previous_year)
                        except ValueError:
                            # 29 February does not exist in the target year
                            article.date = article.date.replace(
                                year=previous_year, day=28)
                else:
                    article.date = readDate(datestr)
                # Use the key that was tested (data[2]) for the alias lookup;
                # the old lookup used data[1], which is the section.
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[2]]
                else:
                    medium_str = data[2]
                article.medium = Medium.get_or_create(medium_str)
                article.section = data[1]

        paragraphs = []
        paragraph = ""
        while True:
            # a trailing "=" marks a soft line break and is dropped
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  # subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        kept = []
        for p in paragraphs:
            kept.append(p + "\n\n")
            if p.startswith("(") and "," in p:  # last line of normal content
                break
        article.text = "".join(kept)

        yield article