Beispiel #1
0
    def scrape_file(self, _html, t):
        """Yield one HTMLDocument per article div found in an export page.

        @param _html: parsed lxml HTML tree of the export file
        @param t: the file's name/path, used to detect the export flavour
        @raises ValueError: when *t* matches no known export flavour
        """
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            # RSS exports wrap each article in a div with id "sort_<n>".
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')
            ]
        else:
            # Previously `divs` stayed unbound here, so the loop below died
            # with an UnboundLocalError; fail with a clear message instead.
            raise ValueError("unrecognized file type: {t!r}".format(t=t))

        for div in divs:
            article = HTMLDocument()
            article.props.html = div
            article.props.headline = div.cssselect(
                "#articleTitle")[0].text_content()
            article.props.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.props.pagenr, article.props.section = self.get_pagenum(
                    articlepage[0].text)

            # An empty source title falls back to a placeholder medium.
            if not div.cssselect("#sourceTitle")[0].text:
                article.props.medium = Medium.get_or_create("unknown medium")
            else:
                article.props.medium = Medium.get_or_create(
                    div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.props.date = readDate(date_str)
            except ValueError:
                # Articles with unparseable dates are logged and skipped.
                log.error(
                    "parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Beispiel #2
0
 def get_medium(self, text):
     """Return a Medium for *text*, resolving known aliases.

     Empty/None input falls back to the "unknown" medium.
     """
     if not text:
         text = "unknown"
     # Single dict lookup replaces the old `.keys()` scan + second lookup.
     return Medium.get_or_create(MEDIUM_ALIASES.get(text, text))
Beispiel #3
0
    def scrape_file(self, _html, t):
        """Yield one HTMLDocument per article div found in an export page.

        NOTE(review): if *t* matches neither "werkmap" nor "intranet/rss",
        `divs` is never bound and the loop below raises UnboundLocalError.
        """
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            # RSS exports wrap each article in a div with id "sort_<n>".
            divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]

        for div in divs:
            article = HTMLDocument()
            article.props.html = div
            article.props.headline = div.cssselect("#articleTitle")[0].text_content()
            article.props.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.props.pagenr, article.props.section = self.get_pagenum(articlepage[0].text)

            # An empty source title falls back to a placeholder medium.
            if not div.cssselect("#sourceTitle")[0].text:
                article.props.medium = Medium.get_or_create("unknown medium")
            else:
                article.props.medium = Medium.get_or_create(div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.props.date = readDate(date_str)
            except ValueError:
                # Articles with unparseable dates are logged and skipped.
                log.error("parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Beispiel #4
0
    def test_medium(self):
        """Check medium precedence in CSV upload.

        The cases below exercise: per-row `medium` column beats
        `medium_existing`, which beats `medium_name`.
        """
        import functools
        header = ('kop', 'datum', 'tekst', 'med')
        data = [('kop1', '2001-01-01', '', 'Bla')]

        test = functools.partial(_run_test_csv,
                                 header,
                                 data,
                                 text="tekst",
                                 headline="kop",
                                 date="datum")
        # Medium column only: the row's own value ("Bla") is used.
        articles = test(medium_name=None, medium="med")
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0].medium.name, "Bla")

        # Existing medium only.
        articles = test(medium_existing=Medium.get_or_create("1").id)
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0].medium.name, "1")

        # Medium column beats an existing medium.
        articles = test(medium_existing=Medium.get_or_create("1").id,
                        medium="med")
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0].medium.name, "Bla")

        # Medium column beats medium_name.
        articles = test(medium_name="bla2", medium="med")
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0].medium.name, "Bla")

        # Existing medium beats medium_name.
        articles = test(medium_name="bla2",
                        medium_existing=Medium.get_or_create("2").id)
        self.assertEqual(len(articles), 1)
        self.assertEqual(articles[0].medium.name, "2")
 def get_medium(self, text):
     """Look up the Medium for *text*; aliases are resolved and empty input
     maps to "unknown"."""
     name = text or "unknown"
     if name in MEDIUM_ALIASES:
         name = MEDIUM_ALIASES[name]
     return Medium.get_or_create(name)
Beispiel #6
0
 def create_medium(self, medium):
     """Return a Medium for *medium*, resolving aliases.

     Empty/None input falls back to "unknown".  (`not medium` already
     covers the empty-string case, so the old `len(medium) < 1` test
     was redundant.)
     """
     if not medium:
         medium = "unknown"
     # Single dict lookup replaces the old `.keys()` scan + second lookup.
     return Medium.get_or_create(MEDIUM_ALIASES.get(medium, medium))
Beispiel #7
0
    def test_medium(self):
        """Medium precedence: per-row `medium` column > medium_existing > medium_name."""
        import functools
        header = ('kop', 'datum', 'tekst', 'med')
        data = [('kop1', '2001-01-01', '', 'Bla')]

        run = functools.partial(
            _run_test_csv, header, data,
            text="tekst", headline="kop", date="datum")

        def check(expected_name, **kwargs):
            # Every scenario uploads exactly one article.
            arts = run(**kwargs)
            self.assertEqual(len(arts), 1)
            self.assertEqual(arts[0].medium.name, expected_name)

        check("Bla", medium_name=None, medium="med")
        check("1", medium_existing=Medium.get_or_create("1").id)
        check("Bla", medium_existing=Medium.get_or_create("1").id, medium="med")
        check("Bla", medium_name="bla2", medium="med")
        check("2", medium_name="bla2", medium_existing=Medium.get_or_create("2").id)
 def create_medium(self, medium):
     """Return the Medium for *medium*; blank input maps to "unknown"."""
     name = medium
     if not name or len(name) < 1:
         name = "unknown"
     if name in MEDIUM_ALIASES:
         name = MEDIUM_ALIASES[name]
     return Medium.get_or_create(name)
Beispiel #9
0
 def create_medium(self, html):
     """Return a Medium named after *html*'s text content.

     Empty text falls back to "unknown"; known aliases are resolved.
     """
     medium = html.text or "unknown"
     # Single dict lookup replaces the old `.keys()` scan + second lookup.
     return Medium.get_or_create(MEDIUM_ALIASES.get(medium, medium))
Beispiel #10
0
 def create_medium(self, html):
     """Create/fetch the Medium named by *html*'s text ("unknown" if empty)."""
     if html.text:
         name = html.text
     else:
         name = "unknown"
     if name in MEDIUM_ALIASES:
         name = MEDIUM_ALIASES[name]
     return Medium.get_or_create(name)
Beispiel #11
0
 def _scrape_unit(self, unit):
     """Delegate to the wrapped scraper, tagging each article with a medium.

     Comment articles get a separate "<medium> - Comments" medium; articles
     without text fall back to a pretty-printed dump of their results.
     """
     scraper, inner_unit = unit
     for art in scraper._scrape_unit(inner_unit):
         suffix = " - Comments" if art.is_comment else ""
         art.props.medium = Medium.get_or_create(scraper.medium_name + suffix)
         if not hasattr(art.props, 'text'):
             art.props.text = pformat(art.props.results)
         yield art
Beispiel #12
0
 def _scrape_unit(self, unit):
     """Scrape via the wrapped scraper; record its medium and tag each article.

     Also stores the wrapped scraper's medium on self.medium_name as a side
     effect, mirroring the original behaviour.
     """
     scraper, inner_unit = unit
     self.medium_name = scraper.medium
     for art in scraper._scrape_unit(inner_unit):
         suffix = " - Comments" if art.is_comment else ""
         art.props.medium = Medium.get_or_create(scraper.medium + suffix)
         if not hasattr(art.props, 'text'):
             art.props.text = pformat(art.props.results)
         yield art
Beispiel #13
0
 def _get_units(self):
     """Yield forum thread URLs to scrape, read from UNIT_FILE.

     The live-crawl implementation is kept below as a (disabled) string
     literal; units now come from a previously saved file instead.
     """
     self.medium = Medium.get_or_create(self.medium_name)
     """        doc = self.getdoc(self.index_url)
     skip = True
     for li in doc.cssselect("ol.childforum li.forumbit_post"):
         forum_url = urljoin(doc.base_url,li.cssselect("h2.forumtitle a")[0].get('href'))
         if START_AT0] in forum_url:
             skip = False
         if skip:
             continue
         for page in self.__get_pages(forum_url):
             for li in page.cssselect("#threads li.threadbit"):
                 try:
                     unit = li.cssselect("h3.threadtitle a")[0].get('href')
                 except IndexError as e:
                     print(e)
                 else:
                     print(unit, file=UNIT_FILE)
                     yield unit"""
     units = set(map(str.strip, UNIT_FILE.readlines()))
     skip_until = "wie-schrijft-blijft/359215"
     for unit in units:
         # Once the marker matches, skip_until becomes "" and the empty
         # string is a substring of everything, so all later units yield.
         # NOTE(review): `units` is a set, so iteration order is arbitrary;
         # "resume after marker" may not behave as intended -- confirm.
         if skip_until in unit:
             skip_until = ""
             yield unit
Beispiel #14
0
    def _scrape_unit(self, fn):
        """Download STL subtitle file *fn* over FTP and yield it as an Article."""
        dest = StringIO()
        with self.ftp() as ftp:
            ftp.retrbinary(b'RETR %s' % (fn.encode('latin-1')), dest.write)
        body = STLtoText(dest.getvalue())
        # STL bodies start with an "888" marker; strip it and the padding.
        body = body.decode('latin-1', 'ignore').strip().lstrip('888').strip()
        title = fn.split('/')[-1]
        # File names look like ".../<something>-<medium>.stl" -- presumably;
        # confirm against actual FTP listings.
        medium = title.split('-')[-1].split('.stl')[0].strip().lower()
        date = getDate(title)

        # The 20:00 NOS broadcast gets its own medium name.
        if medium == 'nos journaal' and int(format(date, '%H')) == 20 and int(
                format(date, '%M')) == 0:
            medium = 'nos journaal 20:00'
        med = Medium.get_or_create(medium)
        # mediadict remaps some medium ids to canonical ones.
        if med.id in mediadict:
            print("saving %s as %s" % (med.id, mediadict[med.id]))
            med = Medium.objects.get(id=mediadict[med.id])

        headline = "%s (%s)" % (medium, fn.replace('.stl', '').strip())
        art = Article(headline=headline,
                      text=body,
                      medium=med,
                      date=date,
                      url=fn)
        yield art
Beispiel #15
0
 def _create_medium(self, source):
     """Ensure a Medium named *source* exists (case-insensitive match).

     When missing, it is created with language id 1, a 5-character
     abbreviation and circulation 1.
     """
     try:
         Medium.objects.get(name__iexact=source)
     except Medium.DoesNotExist:
         language = Language.objects.get(id=1)
         medium = Medium(name=source, abbrev=source[:5],
                         circulation=1, language=language)
         medium.save()
Beispiel #16
0
 def _get_units(self):
     """Yield article ids for the requested date's Telegraaf-i paper."""
     self.medium = Medium.get_or_create(self.medium_name)
     d = self.options['date']
     # Fetch the list of available papers and pick the one for date *d*;
     # raises IndexError when no paper exists for that date.
     data = json.loads(self.open("http://www.telegraaf.nl/telegraaf-i/newspapers").read())
     self.paperdata = [i for i in data if i['date'] == "{}-{:02d}-{:02d}".format(d.year,d.month,d.day)][0]
     articles = []
     for page in self.paperdata['pages']:
         articles += page['articles']
     for article_id in articles:
         yield article_id
 def _scrape_unit(self, ftuple):
     """Yield one Article built from a (title, url, body) tuple.

     The lower-cased title doubles as both medium name and headline.
     """
     title, url, body = ftuple

     date = getDate(url)
     medium = title.lower()
     med = Medium.get_or_create(medium)

     yield Article(headline=medium, text=body,
                   medium=med, date=date, url=url)
Beispiel #18
0
    def _medium(self):
        """Return the medium to use for uploaded articles.

        Precedence: if a per-article medium column is configured the bare
        `return` yields None (each row carries its own medium -- assumed
        intentional, confirm with callers); otherwise an existing medium;
        otherwise one is created from medium_name and cached in options.

        @raises ValueError: when no medium option is set at all
        """
        if self.options["medium"]:
            return

        if self.options['medium_existing']:
            return self.options['medium_existing']

        if self.options['medium_name']:
            med = Medium.get_or_create(self.options['medium_name'])
            self.options['medium_existing'] = med
            return med

        raise ValueError("No medium specified!")
Beispiel #19
0
    def __init__(self, *args, **kargs):
        """Initialise the scraper: resolve medium/project and decode options.

        NOTE(review): this is Python 2 code -- `str.decode` and
        `dict.iteritems` below do not exist on Python 3 str/dict.
        """
        super(Scraper, self).__init__(*args, **kargs)
        self.medium = Medium.get_or_create(self.medium_name)
        self.project = self.options['project']
        # Coerce all plain-str option values to unicode (Python 2).
        for k, v in self.options.items():
            if type(v) == str:
                self.options[k] = v.decode('utf-8')

        # avoid django problem/bug with repr(File(open(uncode-string)))
        # https://code.djangoproject.com/ticket/8156   
        o2 = {k:v for k,v in self.options.iteritems() if k != 'file'}
        log.debug(u"Articleset: {self.articleset!r}, options: {o2}"
                  .format(**locals()))
Beispiel #20
0
    def _scrape_unit(self, unit):
        """Scrape via the wrapped scraper; set medium and rank, strip text fields.

        NOTE(review): `unicode` is Python 2 only.
        """
        (scraper, unit, rank) = unit
        for article in scraper._scrape_unit(unit):
            article.props.medium = Medium.get_or_create(scraper.source)
            article.props.rank = rank
            for attr in ['headline', 'author', 'text']:
                if hasattr(article.props, attr):

                    value = getattr(article.props, attr)
                    # Strip surrounding whitespace from string-valued props.
                    if isinstance(value, str) or isinstance(value, unicode):
                        value = value.strip()
                    setattr(article.props, attr, value)
            yield article
Beispiel #21
0
    def _medium(self):
        if self.options["medium"]:
            return

        if self.options['medium_existing']:
            return self.options['medium_existing']

        if self.options['medium_name']:
            med = Medium.get_or_create(self.options['medium_name'])
            self.options['medium_existing'] = med
            return med

        raise ValueError("No medium specified!")
Beispiel #22
0
    def _scrape_unit(self, unit):
        """Delegate scraping of one (scraper, unit, rank) triple; tag the results.

        NOTE(review): the `unicode` builtin below exists only on Python 2.
        """
        (scraper, unit, rank) = unit
        for article in scraper._scrape_unit(unit):
            article.props.medium = Medium.get_or_create(scraper.source)
            article.props.rank = rank
            for attr in ['headline', 'author', 'text']:
                if hasattr(article.props, attr):

                    value = getattr(article.props, attr)
                    # Only string-valued properties are stripped.
                    if isinstance(value, str) or isinstance(value, unicode):
                        value = value.strip()
                    setattr(article.props, attr, value)
            yield article
Beispiel #23
0
def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters. It raises an
    error (Medium.DoesNotExist) when the given source does not have
    an entry in the database.

    @param headline: headline of new Article-object
    @type headline: str

    @param byline: byline for new Article
    @type byline: NoneType, str

    @param text: text for new Article
    @type text: str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. However, some information
                 (author, length, section, url) will be extracted.
    @type meta: dictionary

    @return Article-object

    """
    log.debug("Creating article object for {headline!r}".format(**locals()))

    art = Article(headline=headline, byline=byline, text=text, date=date)

    art.medium = Medium.get_or_create(source)

    # Author / Section
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)
    if 'length' in meta:
        art.length = int(meta.pop('length').split()[0])
    else:
        # Fall back to a crude word count (number of spaces).
        art.length = art.text.count(" ")
    if 'url' in meta:
        art.url = meta.pop('url')
        # Raw string for the regex: "\s" in a plain literal is an invalid
        # escape sequence (SyntaxWarning on modern Python).
        art.url = re.sub(r"\s+", "", art.url)

    art.metastring = str(meta)

    return art
Beispiel #24
0
    def _scrape_unit(self, fn):
        """Download STL subtitle file *fn* over FTP and yield it as an Article."""
        dest = StringIO()
        self._ftp.retrbinary(b'RETR %s' % (fn.encode('latin-1')) , dest.write)
        body = STLtoText(dest.getvalue())
        # STL bodies start with an "888" marker; strip it and the padding.
        body = body.decode('latin-1','ignore').strip().lstrip('888').strip()
        title = fn.split('/')[-1]
        # File names look like ".../<something>-<medium>.stl" -- presumably;
        # confirm against actual FTP listings.
        medium = title.split('-')[-1].split('.stl')[0].strip().lower()
        date = getDate(title)

        # The 20:00 NOS broadcast gets its own medium name.
        if medium == 'nos journaal' and int(format(date, '%H')) == 20 and int(format(date, '%M')) == 0: medium = 'nos journaal 20:00'
        # mediadict remaps some medium names to canonical ones.
        if medium in mediadict.keys():
            medium = mediadict[medium]
        med = Medium.get_or_create(medium)
        art = Article(headline=medium, text=body,
                      medium = med, date=date, url = fn)
        yield art
Beispiel #25
0
def body_to_article(headline, byline, text, date, source, meta):
    """
    Build an (unsaved) Article from parsed document fields. Raises
    Medium.DoesNotExist when *source* has no database entry.

    @param headline: headline of new Article-object
    @type headline: unicode / str

    @param byline: byline for new Article
    @type byline: NoneType, unicode, str

    @param text: text for new Article
    @type text: unicode / str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: unicode / str

    @param meta: dict of remaining metadata; 'author', 'section' and
                 'length' are extracted, the rest is kept as metastring
    @type meta: dictionary

    @return Article-object

    """
    log.debug("Creating article object for {headline!r}".format(**locals()))

    article = Article(headline=headline, byline=byline, text=text, date=date)
    article.medium = Medium.get_or_create(source)

    # Pull author/section/length out of a copy; leave the rest for metastring.
    remaining = meta.copy()
    article.author = remaining.pop('author', None)
    article.section = remaining.pop('section', None)
    if 'length' in remaining:
        article.length = int(remaining.pop('length').split()[0])
    else:
        # Crude word count: number of spaces in the text.
        article.length = article.text.count(" ")
    article.metastring = str(remaining)

    return article
    def _scrape_unit(self, ftuple):
        """Yield one Article built from a (title, url, body) tuple.

        The headline combines the lower-cased title (also used as medium
        name) with the bare file name from the URL.
        """
        title, url, body = ftuple

        date = getDate(url)
        medium = title.lower()
        med = Medium.get_or_create(medium)

        basename = url.split('/')[-1].replace('.stl', '').strip()
        headline = "%s (%s)" % (medium, basename)
        yield Article(headline=headline,
                      text=body,
                      medium=med,
                      date=date,
                      url=url)
Beispiel #27
0
    def parse_document(self, file):
        """Build an Article from an uploaded file (or the 'text' option).

        Missing date/headline/section metadata is derived from the file's
        path; .doc/.docx/.pdf files are converted to plain text first.
        """
        if file:
            dirname, filename = os.path.split(file.name)
            filename, ext = os.path.splitext(filename)
        else:
            dirname, filename, ext = None, None, None

        metadata = dict((k, v) for (k, v) in self.options.items()
                        if k in ["headline", "project", "date", "section"])
        metadata["medium"] = Medium.get_or_create(self.options['medium'])

        if not metadata["date"]:
            # File names are expected to start with "<datestring>_".
            datestring, filename = filename.split("_", 1)
            metadata["date"] = toolkit.read_date(datestring)

        # (This check was accidentally duplicated in the original; once
        # is enough.)
        if not metadata["headline"].strip():
            metadata["headline"] = filename

        if not metadata["section"].strip():
            metadata["section"] = dirname

        if file:
            convertors = None
            if ext.lower() == ".docx":
                convertors = [_convert_docx, _convert_doc]
            elif ext.lower() == ".doc":
                convertors = [_convert_doc, _convert_docx]
            elif ext.lower() == ".pdf":
                convertors = [_convert_pdf]

            if convertors:
                text = _convert_multiple(file, convertors)
            else:
                text = file.text
        else:
            text = self.options['text']

        return Article(text=text, **metadata)
    def parse_document(self, file):
        """Build an Article from an uploaded file (or the 'text' option).

        Missing date/headline/section metadata is derived from the file's
        path; .doc/.docx/.pdf files are converted to plain text first.
        NOTE(review): the headline check appears twice below -- redundant.
        """
        if file:
            dirname, filename = os.path.split(file.name)
            filename, ext = os.path.splitext(filename)
        else:
            dirname, filename, ext = None, None, None

        metadata = dict((k, v) for (k,v) in self.options.items()
                        if k in ["headline", "project", "date", "section"])
        metadata["medium"] = Medium.get_or_create(self.options['medium'])

        if not metadata["date"]:
            # File names are expected to start with "<datestring>_".
            datestring, filename = filename.split("_", 1)
            metadata["date"] = toolkit.read_date(datestring)

        if not metadata["headline"].strip():
            metadata["headline"] = filename

        if not metadata["headline"].strip():
            metadata["headline"] = filename

        if not metadata["section"].strip():
            metadata["section"] = dirname

        if file:
            convertors = None
            if ext.lower() == ".docx":
                convertors = [_convert_docx, _convert_doc]
            elif ext.lower() == ".doc":
                convertors = [_convert_doc, _convert_docx]
            elif ext.lower() == ".pdf":
                convertors = [_convert_pdf]

            if convertors:
                text = _convert_multiple(file, convertors)
            else:
                text = file.text
        else:
            text = self.options['text']

        return Article(text=text, **metadata)
Beispiel #29
0
    def _postprocess_article(self, article):
        """
        Finalize an article. This should convert the output of _scrape_unit to the required
        output for scrape_unit, e.g. convert to Article, add project and/or medium
        """
        comment = False
        if isinstance(article, Document):
            if hasattr(article, 'is_comment') and article.is_comment:
                # Lazily create and cache the "<medium> - Comments" medium.
                if not hasattr(self, 'comment_medium'):
                    self.comment_medium = Medium.get_or_create(self.medium_name + " - Comments")
                comment = True
            article = article.create_article()

        # Only fill in medium/project when the article lacks them.
        if comment:
            _set_default(article, "medium", self.comment_medium)
        else:
            _set_default(article, "medium", self.medium)

        _set_default(article, "project", self.project)
        article.scraper = self
        return article
Beispiel #30
0
 def parse_document(self, tupleText):
     """Parse one (meta, body) text tuple into an Article.

     Expected meta layout (newline-separated):
       line 0: "<externalid>. <headline>"
       line 2: "<medium>, <date>, p.<pagenr>, w.<length>"
     Body line 2 holds the section; article text starts at body line 5.
     """
     meta, body = tupleText
     meta = meta.strip()
     meta = meta.split('\n')
     kargs = {}
     # Some dumps prefix the external id with '?'; strip it before int().
     kargs['externalid'] = int(meta[0].split('.')[0].lstrip('?'))
     kargs['headline'] = meta[0].partition('. ')[2]
     
     medium_name, date, pagenr, length = meta[2].split(', ')
     kargs['medium'] = Medium.get_or_create(medium_name)
     kargs['date'] = readDate(date)
     kargs['pagenr'] = int(pagenr.strip('p.'))
     kargs['length']  = int(length.strip('w.'))
     
     body = body.split('\n')
     kargs['section'] = body[2]
     
     kargs['text'] = '\n'.join(body[5:])
     
     kargs['project'] = self.options['project']
     
     return Article(**kargs)
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

# One-off maintenance script: repoint articles in project 29 from BZK medium
# aliases to their canonical medium, deleting an alias once it has no
# articles left anywhere.
for alias, medium in aliases.items():
    if alias != medium:
        print(alias, " > ", medium)
        # Change all articles in project 29.
        alias = Medium.get_or_create(alias)
        articles = Article.objects.filter(medium=alias.id, project_id=29)
        print("{} articles".format(articles.count()))
        articles.update(medium=Medium.get_or_create(medium).id)
        # If the alias medium is now empty (across all projects), delete it.
        if Article.objects.filter(medium=alias.id).count() == 0:
            print('deleting...')
            alias.delete()
    else:
        print('alias is no alias')
Beispiel #32
0
def media(request):
    """Render the media report page.

    The template reads `canadd` and `media` via locals(), so these local
    names are part of the behaviour -- do not rename them.
    """
    canadd = Medium.can_create(request.user)
    media = Datatable(MediumResource)
    return render(request, 'navigator/report/media.html', locals())
Beispiel #33
0
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail digest file and yield a single Article.

        The mail consists of a header, a metadata line
        ("<date>, <section>, <medium>"), an upper-case headline, and
        quoted-printable-style paragraphs separated by blank lines.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = readDate(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  #actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  #loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():  #headline
                article.headline = line
                break
            elif line:  #first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = readDate(datestr)
                    if (
                            article.date - file_date
                    ).days > 200:  #likely a misparse, with the mail being sent the next year
                        # BUGFIX: timedelta() has no 'years' argument -- the
                        # old timedelta(years=1) raised TypeError at runtime.
                        # Approximate one year with 365 days.
                        article.date -= timedelta(days=365)
                else:
                    article.date = readDate(datestr)
                # BUGFIX: membership is tested with data[2] (the medium), so
                # the alias lookup must use data[2] too (was data[1], the
                # section -- a mismatched key).
                if data[2] in BZK_ALIASES.keys():
                    medium_str = BZK_ALIASES[data[2]]
                else:
                    medium_str = data[2]
                article.medium = Medium.get_or_create(medium_str)
                article.section = data[1]

        paragraphs = []
        paragraph = ""
        while True:
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  #subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(
                    p.split(",")) > 1:  #laatste regel van normale content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        # NOTE(review): 'string-escape' decoding exists only on Python 2.
        def character(match):
            code = match.group()[1:]
            char = r"\x{}".format(code).decode('string-escape').decode(
                'latin-1')
            if code == "92": return "'"
            elif code == "85": return "..."
            return char

        article.text = re.sub("=[A-Z0-9]{2}", character, article.text)

        yield article
Beispiel #34
0
def media(request):
    """Media overview view: passes `canadd` and `media` to the template
    through locals(), so those local names must stay as-is."""
    canadd = Medium.can_create(request.user)
    media = Datatable(MediumResource)
    return render(request, 'navigator/report/media.html', locals())
from amcat.models.medium import Medium
from amcat.models.article import Article
from amcat.scripts.article_upload.bzk_aliases import BZK_ALIASES as aliases

# One-off maintenance script: repoint articles in project 29 from BZK medium
# aliases to their canonical medium, deleting an alias once it has no
# articles left anywhere.
for alias, medium in aliases.items():
    if alias != medium:
        print(alias, " > ", medium)
        # Change all articles in project 29.
        alias = Medium.get_or_create(alias)
        articles = Article.objects.filter(medium = alias.id, project_id = 29)
        print("{} articles".format(articles.count()))
        articles.update(medium = Medium.get_or_create(medium).id)
        # If the alias medium is now empty (across all projects), delete it.
        if Article.objects.filter(medium = alias.id).count() == 0:
            print('deleting...')
            alias.delete()
    else:
        print('alias is no alias')

Beispiel #36
0
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail digest file and yield a single Article.

        NOTE(review): two latent bugs below, confirm before relying on this:
        `timedelta(years=1)` raises TypeError (timedelta has no 'years'
        argument), and the alias lookup uses BZK_ALIASES[data[1]] although
        membership is tested with data[2].
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = readDate(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  #actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  #loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():  #headline
                article.headline = line
                break
            elif line:  #first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = readDate(datestr)
                    if (
                                article.date - file_date).days > 200:  #likely a misparse, with the mail being sent the next year
                        article.date -= timedelta(years=1)
                else:
                    article.date = readDate(datestr)
                if data[2] in BZK_ALIASES.keys():
                    medium_str = BZK_ALIASES[data[1]]
                else:
                    medium_str = data[2]
                article.medium = Medium.get_or_create(medium_str)
                article.section = data[1]

        paragraphs = []
        paragraph = ""
        while True:
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  #subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(p.split(",")) > 1:  #laatste regel van normale content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        # NOTE(review): 'string-escape' decoding exists only on Python 2.
        def character(match):
            code = match.group()[1:]
            char = r"\x{}".format(code).decode('string-escape').decode('latin-1')
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            return char

        article.text = re.sub(
            "=[A-Z0-9]{2}",
            character,
            article.text)

        yield article