Example #1
def _is_date(string):
    try:
        toolkit.read_date(string)
    except ValueError:
        return False

    return True
Example #2
def _is_date(string, language_pool=None):
    if not re.search(r"\d", string):
        return False  # no number = no date; optimization because dateparse is very slow on non-matches
    try:
        toolkit.read_date(string, language_pool=language_pool)
    except ValueError:
        return False

    return True
Example #3
def _is_date(string):
    if not re.search(r"\d", string):
        return False  # no number = no date; optimization because dateparse is very slow on non-matches
    try:
        toolkit.read_date(string)
    except ValueError:
        return False

    return True
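The digit pre-check in the variants above is a useful pattern on its own: a cheap regular-expression filter guards an expensive parser. A minimal standalone sketch, with dateutil assumed as a stand-in for toolkit.read_date:

import re
from dateutil import parser  # stand-in for toolkit.read_date, assumed available

def is_probably_date(string):
    # A date must contain at least one digit; skip the slow parser otherwise.
    if not re.search(r"\d", string):
        return False
    try:
        parser.parse(string)
    except (ValueError, OverflowError):
        return False
    return True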
Example #4
 def parse_dateline(cls, text, article):
     bits = text.split()
     if "-" in bits[-1]:
         article["date"] = read_date(bits[-1])
         article["medium"] = cls.get_medium(" ".join(bits[:-1]))
     elif len(bits) >= 3 and bits[-1].isdigit() and bits[-3].isdigit():
         article["date"] = read_date(" ".join(bits[-3:]))
         article["medium"] = cls.get_medium(" ".join(bits[:-3]))
     else:
         article["medium"] = cls.get_medium(" ".join(bits))
         article["date"] = None
     return article
Example #5
 def parse_dateline(cls, text, article):
     bits = text.split()
     if not bits:
         raise ParseError("Couldn't find date in article: {}".format(article['title']))
     if "-" in bits[-1]:
         article["date"] = read_date(bits[-1])
         article["medium"] = cls.get_medium(" ".join(bits[:-1]))
     elif len(bits) >= 3 and bits[-1].isdigit() and bits[-3].isdigit():
         article["date"] = read_date(" ".join(bits[-3:]))
         article["medium"] = cls.get_medium(" ".join(bits[:-3]))
     else:
         article["medium"] = cls.get_medium(" ".join(bits))
         article["date"] = None
     return article
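For reference, the three branches above correspond to dateline shapes like the following (hypothetical inputs; the enclosing scraper class providing get_medium is assumed):

# parse_dateline("Trouw 31-01-2014", {})        -> last token contains "-", parsed as date
# parse_dateline("Trouw 31 januari 2014", {})   -> trailing "<day> <month> <year>" tokens
# parse_dateline("Trouw", {})                   -> no date found; medium only, date set to None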
Example #6
 def test_readdate(self):
     for s, date in (
         ("22 maart 1980", datetime.datetime(1980, 3, 22, 0, 0, 0)),
         ("22 mrt 1980", datetime.datetime(1980, 3, 22, 0, 0, 0)),
         ("22/3/1980", datetime.datetime(1980, 3, 22, 0, 0, 0)),
         ("1980-05-02", datetime.datetime(1980, 5, 2, 0, 0, 0)),
         ("1980-3-22", datetime.datetime(1980, 3, 22, 0, 0, 0)),
         ("1980-3-22T01:00:05", datetime.datetime(1980, 3, 22, 1, 0, 5)),
         ("1980-3-22 01:00", datetime.datetime(1980, 3, 22, 1, 0, 0)),
         ("1980-3-22 01:00 PM", datetime.datetime(1980, 3, 22, 13, 0, 0)),
         ("1/1/98", datetime.datetime(1998, 1, 1, 0, 0, 0)),
         ("1/1/04", datetime.datetime(2004, 1, 1, 0, 0, 0)),
         ("31/12/72", datetime.datetime(1972, 12, 31, 0, 0, 0)),
         ("1/2/1972", datetime.datetime(1972, 2, 1, 0, 0, 0)),
         ("30.09.2008", datetime.datetime(2008, 9, 30, 0, 0, 0)),
         ("02.09.2008", datetime.datetime(2008, 9, 2, 0, 0, 0)),
         ("30-09-2008", datetime.datetime(2008, 9, 30, 0, 0, 0)),
         ("02-09-2008", datetime.datetime(2008, 9, 2, 0, 0, 0)),
         ("31. Januar 2009", datetime.datetime(2009, 1, 31, 0, 0, 0)),
         ("March 31, 2003", datetime.datetime(2003, 3, 31, 0, 0, 0)),
         ("December 31, 2009 Thursday",
          datetime.datetime(2009, 12, 31, 0, 0, 0)),
         (u'30 ao\xfbt 2002', datetime.datetime(2002, 8, 30, 0, 0, 0)),
         ('31. Maerz 2003', datetime.datetime(2003, 3, 31, 0, 0, 0)),
         ('September 1, 2008 Monday 12:44 PM AEST',
          datetime.datetime(2008, 9, 1, 12, 44)),
         ('23aug2013', datetime.datetime(2013, 8, 23, 0, 0, 0)),
     ):
         date2 = toolkit.read_date(s)
         self.assertEqual(date, date2, "while parsing {}".format(repr(s)))
Example #7
    def scrape_1(cls, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [div for div in _html.cssselect("#sort div") if "sort_" in div.get('id')]
        else:
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = {"html": div.text_content()}
            article["title"] = div.cssselect("#articleTitle")[0].text_content()
            article["text"] = div.cssselect("#articleIntro")[0].text_content()
            articlepage = div.cssselect("#articlePage")

            if articlepage:
                article["pagenr"], article["section"] = cls.get_pagenum(articlepage[0].text_content())

            article["medium"] = cls.get_medium(div.cssselect("#sourceTitle")[0].text_content())
            date_str = div.cssselect("#articleDate")[0].text_content()

            try:
                article["date"] = read_date(date_str)
            except ValueError:
                log.error("parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Example #8
    def scrape_2(cls, _html):
        """New format as of 2014 and a few days before"""
        title = _html.cssselect("h1")[0]
        if not title.text:
            title = title.cssselect("span")[0]
        docdate = read_date(title.text.split("-")[1])

        # split body by <hr>
        items = []
        item = []
        
        if len(_html.cssselect("body > hr")) == 0:
            # select MS Word div wrapper
            tags = _html.cssselect("body > div.WordSection1 > *")
            if len(tags) == 0:
                raise ParseError("Document format is not supported")

        else:
            tags = _html.cssselect("body > *")

        for child in tags:
            if child.tag == "hr" or (child.tag == "div" and child.cssselect("span > hr")):
                items.append(item)
                item = []
            else:
                item.append(child)

        # first item is the index
        items = items[1:]
        for item in items:
            article = cls.parse_item(item)
            if not article["date"]:
                article["date"] = docdate
            yield article
Example #9
    def parse_document(self, file):
        dirname, filename = os.path.split(file.name)
        filename, ext = os.path.splitext(filename)

        metadata = dict((k, v) for (k, v) in self.options.items()
                        if k in ["medium", "headline", "project", "date", "section"])
        if not metadata["date"]:
            datestring, filename = filename.split("_", 1)
            metadata["date"] = toolkit.read_date(datestring)

        if not metadata["headline"].strip():
            metadata["headline"] = filename
            
        if not metadata["section"].strip():
            metadata["section"] = dirname

        convertors = None
        if ext.lower() == ".docx":
            convertors = [_convert_docx, _convert_doc]
        elif ext.lower() == ".doc":
            convertors = [_convert_doc, _convert_docx]

        if convertors:
            text = _convert_multiple(file, convertors)
        else:
            text = file.text
        return Article(text=text, **metadata)
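When the date option is empty, the uploader above falls back to a date prefix in the file name, so uploads are expected to follow a "<date>_<headline>" naming convention. A hypothetical example:

# "2013-01-31_Some headline.docx"
#   -> datestring = "2013-01-31", filename = "Some headline"
#   -> metadata["date"] = toolkit.read_date("2013-01-31")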
Example #10
    def test_post(self):
        """Test whether posting and retrieving an article works correctly"""
        a = test_article()

        res = self._post_articles(a)
        self.assertEqual(set(res.keys()), {'id'})  # POST should only return IDs

        res = self._get_article(aid=res['id'])
        self.assertEqual(res["title"], a['title'])
        self.assertEqual(toolkit.read_date(res["date"]), toolkit.read_date(a['date']))
        self.assertNotIn("text", res.keys())
        self.assertIsNotNone(res["hash"])

        res = self._get_article(aid=res['id'], text=True)
        self.assertEqual(res["text"], a['text'])

        res = self._get_articles()["results"]
        self.assertEqual(len(res), 1)
Example #11
def parse_page(doc_elements):
    """Parses an APA page given in a list of Etree elements."""
    doc, elements = doc_elements
    elements = [e for e in elements if not isinstance(e, lxml.html.HtmlComment)]

    headline = set(get_descendants(doc.cssselect("b"))) & set(elements)
    meta = (set(get_descendants(doc.cssselect("i"))) & set(elements)) - headline
    text = set(elements) - (headline | meta)
    headline = sorted(get_roots(headline), key=lambda e:elements.index(e))

    if not headline:
        raise ValueError("No possible headlines found.")

    remove_tree(meta, ["b"])
    remove_tree(text, ["b", "i"])

    # Some text in italics is not metadata. We only use text before headline elements
    # for metadata.
    lesser_than_headline = lambda e:elements.index(e) <= elements.index(headline[0])
    meta = get_nonempty(filter(lesser_than_headline, meta))

    # Parse metadata
    metadata = {}
    for el in list(meta):
        if get_metadata(metadata, el):
            meta.remove(el)

    if meta:
        metadata["byline"] = " - ".join(m.text for m in meta)

    # Convert date properties to datetime object
    year, month, day = metadata["year"], metadata["month"], metadata["day"]
    hour, minute = metadata.get("hour"), metadata.get("minute")

    datestring = "{day} {month} {year}"
    if hour is not None and minute is not None:
        datestring += ", {hour}:{minute}"

    metadata["date"] = read_date(datestring.format(**locals()))
    for prop in ("year", "month", "day", "hour", "minute"):
        if prop in metadata:
            del metadata[prop]

    # Clean data and get headline
    metadata["medium"] = metadata.get("medium", "APA - Unknown").strip().strip('"')
    medium, headline = metadata["medium"], "".join(["".join(e.itertext()) for e in headline])

    if medium in headline:
        headline = headline.split("-", medium.count("-") + 1)[-1]

    metadata["headline"] = headline

    # Get text. Since ordering is lost in sets, restore original order of elements
    return metadata, "".join(get_text(sorted(text, key=lambda e:elements.index(e)))).strip()
Example #12
    def test_get(self):
        p1 = amcattest.create_test_project(name="testnaam", description="testdescription",
                                           insert_date='2012-01-01')

        actual = self.get(ProjectResource, id=p1.id)

        actual_results = actual.pop("results")
        self.assertEqual(len(actual_results), 1)
        actual_results = actual_results[0]

        date = actual_results.pop('insert_date')
        read_date(date)  # check valid date, not much more to check here?

        expected_results = {
            'insert_user': p1.insert_user.id,
            'description': 'testdescription',
            'name': u'testnaam',
            'guest_role': 11,
            'owner': p1.owner.id,
            'active': True,
            'id': p1.id,
            'last_visited_at': "Never",
            'favourite': False,
            "display_columns": [],
            "r_plugins_enabled": False,
        }

        expected_meta = {
            'page': 1,
            'next': None,
            'previous': None,
            'per_page': 10,
            'total': 1,
            'pages': 1,
            'echo': None,
        }

        self.assertDictsEqual(actual, expected_meta)
        self.assertDictsEqual(actual_results, expected_results)
Example #13
File: xml.py Project: amcat/amcat
 def parse_file(self, file: UploadedFile, _data):
     for fields in _data:
         data = {f["path"]: f["content"] for f in fields}
         art = {}
         for field, setting in self.options['field_map'].items():
             datatype = get_property_primitive_type(field)
             value, typ = setting['value'], setting['type']
             val = data.get(value) if typ == 'field' else value
             if val:
                 if datatype is datetime.datetime and type(val) is str:
                     val = toolkit.read_date(val)
                 art[field] = val
         yield Article(**art)
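The field_map option consumed above maps each article property either to a path in the parsed XML or to a literal value; date-typed fields are then run through toolkit.read_date. A hedged sketch of such a mapping (the exact option format is an assumption, not shown in these examples):

# Hypothetical field_map: 'field' entries are looked up in the parsed data,
# 'literal' entries are used as-is.
field_map = {
    "title": {"type": "field", "value": "article/title"},
    "date": {"type": "field", "value": "article/pubdate"},
    "medium": {"type": "literal", "value": "Example Medium"},
}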
Example #14
 def parse_file(self, file: model_UploadedFile, data):
     self.ln_query, arts = data
     for data in arts:
         art = {}
         for field, setting in self.options['field_map'].items():
             datatype = get_property_primitive_type(field)
             value, typ = setting['value'], setting['type']
             val = data.get(value) if typ == 'field' else value
             if val:
                 if datatype is datetime.datetime and type(val) is str:
                     val = toolkit.read_date(val)
                 art[field] = val
         yield Article(**art)
Example #15
 def parse_file(self, file, encoding, data):
     self.ln_query, arts = data
     for data in arts:
         art = {}
         for field, setting in self.options['field_map'].items():
             datatype = get_property_primitive_type(field)
             value, typ = setting['value'], setting['type']
             val = data.get(value) if typ == 'field' else value
             if val:
                 if datatype is datetime.datetime and type(val) is str:
                     val = toolkit.read_date(val)
                 art[field] = val
         yield Article(**art)
Example #16
    def test_get(self):
        p1 = amcattest.create_test_project(name="testnaam",
                                           description="testdescription",
                                           insert_date='2012-01-01')

        actual = self.get(ProjectResource, id=p1.id)

        actual_results = actual.pop("results")
        self.assertEqual(len(actual_results), 1)
        actual_results = actual_results[0]

        date = actual_results.pop('insert_date')
        read_date(date)  # check valid date, not much more to check here?

        expected_results = {
            'insert_user': p1.insert_user.id,
            'description': 'testdescription',
            'name': u'testnaam',
            'guest_role': 11,
            'owner': p1.owner.id,
            'active': True,
            'id': p1.id,
            'last_visited_at': "Never",
            'favourite': False,
            "r_plugins_enabled": False,
        }

        expected_meta = {
            'page': 1,
            'next': None,
            'previous': None,
            'per_page': 10,
            'total': 1,
            'pages': 1,
            'echo': None,
        }

        self.assertDictsEqual(actual, expected_meta)
        self.assertDictsEqual(actual_results, expected_results)
Example #17
def parse_online_article(art):
    # First, test for online articles with specific format
    blocks = re.split(r"\n *\n\s*", _strip_article(art))
    if len(blocks) != 6:
        return
    medium, url, datestr, headline, nwords, lead = blocks
    if not (url.startswith("http://") or url.startswith("https://")):
        return
    if lead.startswith("Bewaar lees artikel"):
        lead = lead[len("Bewaar lees artikel"):]
    
    if not re.match(r"(\d+) words", nwords):
        return
    date = toolkit.read_date(datestr)
    return headline.strip(), None, lead.strip(), date, medium, {"length": nwords, "url": url}
Example #18
def create_test_article(create=True, articleset=None, deduplicate=True, **kargs):
    """Create a test article"""
    from amcat.models.article import Article

    if "date" in kargs and isinstance(kargs["date"], str):
        kargs["date"] = read_date(kargs["date"])

    if "project" not in kargs: kargs["project"] = create_test_project()
    if "date" not in kargs: kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs: kargs["medium"] = create_test_medium()
    if 'headline' not in kargs: kargs['headline'] = 'test headline {} : {}'.format(_get_next_id(), uuid4())
    if 'text' not in kargs: kargs["text"] = 'test text {}'.format(_get_next_id())

    a = Article(**kargs)
    if create:
        Article.create_articles([a], articleset, deduplicate=deduplicate)
    return a
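Because string dates are normalized through read_date, the helper accepts any supported notation. A minimal usage sketch (create=False keeps the article out of the database):

a = create_test_article(create=False, date="31. Januar 2009")
assert a.date == datetime.datetime(2009, 1, 31)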
Example #19
def parse_meta(text):
    m = re.match(r"(.*?)\s*(Nr. \d+)? vom (\d\d\.\d\d\.\d\d\d\d)( \d\d[.:]\d\d\b)?(.*)", text)
    if not m:
        raise ValueError("Cannot parse meta string {text!r}".format(**locals()))
    medium, nr, date, time, pagestr = m.groups()
    if medium.startswith('"') and medium.endswith('"'):
        medium = medium[1:-1]

    if time:
        date = date + time.replace(".", ":")
    date = toolkit.read_date(date)
    m = re.search(r"Seite:? (\d+)", pagestr)
    if m:
        page = int(m.group(1))
    else:
        page = None

    return medium, date, page
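The regular expression above targets German press-clipping headers of the '... vom <date> ... Seite <page>' form. A hypothetical input and result:

medium, date, page = parse_meta('"Die Welt" Nr. 123 vom 31.01.2009 12:30 Seite 4')
# medium == 'Die Welt'
# date   == datetime.datetime(2009, 1, 31, 12, 30)
# page   == 4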
Example #20
def parse_online_article(art):
    # First, test for online articles with specific format
    blocks = re.split(r"\n *\n\s*", _strip_article(art))
    if len(blocks) != 6:
        return
    medium, url, datestr, title, nwords, lead = blocks
    if not (url.startswith("http://") or url.startswith("https://")):
        return
    if lead.startswith("Bewaar lees artikel"):
        lead = lead[len("Bewaar lees artikel"):]

    m = re.match(r"(\d+) words", nwords)
    if not m:
        return
    nwords = int(m.group(1))
    date = toolkit.read_date(datestr)

    return dict(title=title.strip(), text=lead.strip(), date=date, medium=medium, length_int=nwords, url=url)
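The six blank-line-separated blocks this parser expects, sketched on a hypothetical article:

# De Volkskrant                    <- medium
# https://www.volkskrant.nl/...    <- url (must start with http:// or https://)
# 31 januari 2014                  <- datestr, parsed with toolkit.read_date
# Some headline                    <- title
# 523 words                        <- nwords, must match r"(\d+) words"
# First paragraph of the lead...   <- lead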
Example #21
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article"""
    from amcat.models.article import Article

    if "date" in kargs and isinstance(kargs["date"], basestring):
        kargs["date"] = read_date(kargs["date"])

    if "project" not in kargs: kargs["project"] = create_test_project()
    if "date" not in kargs: kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs: kargs["medium"] = create_test_medium()
    if "id" not in kargs: kargs["id"] = _get_next_id()
    if 'headline' not in kargs: kargs['headline'] = 'test headline'
    if 'text' not in kargs: kargs["text"] = "\n\n".join(map(str, range(5)))

    a = Article(**kargs)
    if create:
        Article.create_articles([a], articleset, check_duplicate=check_duplicate, create_id=True)
    return a
Example #22
def parse_doc(document):
    # We select all 'div' elements directly under '.article'
    divs = document.cssselect("* > div")

    # Check for author field. If present: remove from metadata
    # fields list
    try:
        author_field = document.cssselect(".author")[0]
    except IndexError:
        pass
    else:
        yield "author", author_field.text_content().lstrip("Von").strip()
        divs.remove(author_field)

    # Strip everything before headline
    headline_field = document.cssselect("b.deHeadline")[0].getparent()
    divs = divs[divs.index(headline_field):]

    # Parse metadata. Loop through each 'div' within an article, along with
    # its field name according to META (thus based on its position)
    for field_name, element in zip(META, divs):
        if field_name is None:
            continue

        value = element.text_content().strip()
        if field_name == "length":
            value = int(value.rstrip("words"))
        elif field_name == "date":
            value = read_date(value)
            # WVA: WAS: raise NotImplemented("Parse date here: do not use toolkit.read_date")... why?
        elif field_name == "page":
            if value.strip().isdigit():
                value = int(value.strip())
            else:
                continue

        yield field_name, value

    # Fetch text, which is contained in the paragraph (<p>) elements
    paragraphs = [p.text_content() for p in document.cssselect("p")]
    yield "text", ("\n\n".join(paragraphs)).strip()
Example #23
File: text.py Project: BBie/amcat
    def parse_document(self, file):
        if file:
            dirname, filename = os.path.split(file.name)
            filename, ext = os.path.splitext(filename)
        else:
            dirname, filename, ext = None, None, None

        metadata = dict((k, v) for (k, v) in self.options.items()
                        if k in ["headline", "project", "date", "section"])
        metadata["medium"] = Medium.get_or_create(self.options['medium'])

        if not metadata["date"]:
            datestring, filename = filename.split("_", 1)
            metadata["date"] = toolkit.read_date(datestring)

        if not metadata["headline"].strip():
            metadata["headline"] = filename

        if not metadata["headline"].strip():
            metadata["headline"] = filename

        if not metadata["section"].strip():
            metadata["section"] = dirname

        if file:
            convertors = None
            if ext.lower() == ".docx":
                convertors = [_convert_docx, _convert_doc]
            elif ext.lower() == ".doc":
                convertors = [_convert_doc, _convert_docx]
            elif ext.lower() == ".pdf":
                convertors = [_convert_pdf]

            if convertors:
                text = _convert_multiple(file, convertors)
            else:
                text = "\n".join(file.readlines())
        else:
            text = self.options['text']

        return Article(text=text, **metadata)
Example #24
    def parse_document(self, tupleText):
        meta, body = tupleText
        meta = meta.strip()
        meta = meta.split('\n')
        kargs = {
            'externalid': int(meta[0].split('.')[0].lstrip('?')),
            'headline': meta[0].partition('. ')[2]
        }

        medium_name, date, pagenr, length = meta[2].split(', ')
        kargs['medium'] = Medium.get_or_create(medium_name)
        kargs['date'] = read_date(date)
        kargs['pagenr'] = int(pagenr.strip('p.'))
        kargs['length'] = int(length.strip('w.'))

        body = body.split('\n')
        kargs['section'] = body[2]

        kargs['text'] = '\n'.join(body[5:])

        kargs['project'] = self.options['project']

        return Article(**kargs)
Example #25
def create_test_article(create=True,
                        articleset=None,
                        check_duplicate=False,
                        **kargs):
    """Create a test article"""
    from amcat.models.article import Article

    if "date" in kargs and isinstance(kargs["date"], basestring):
        kargs["date"] = read_date(kargs["date"])

    if "project" not in kargs: kargs["project"] = create_test_project()
    if "date" not in kargs: kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs: kargs["medium"] = create_test_medium()
    if "id" not in kargs: kargs["id"] = _get_next_id()
    if 'headline' not in kargs: kargs['headline'] = 'test headline'
    if 'text' not in kargs: kargs["text"] = "\n\n".join(map(str, range(5)))

    a = Article(**kargs)
    if create:
        Article.create_articles([a],
                                articleset,
                                check_duplicate=check_duplicate,
                                create_id=True)
    return a
Example #26
    def _scrape_unit(self, _file):
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = read_date(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  #actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  #loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():
                article.title = line
                break
            elif line:  #first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = read_date(datestr)
                    if (article.date - file_date).days > 200:
                        # likely a misparse, with the mail being sent the next year;
                        # timedelta has no 'years' argument, so subtract 365 days
                        article.date -= timedelta(days=365)
                else:
                    article.date = read_date(datestr)
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[2]]
                else:
                    medium_str = data[2]
                article.set_property("medium", medium_str)
                article.set_property("section", data[1])

        paragraphs = []
        paragraph = ""
        while True:
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  #subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(
                    p.split(",")) > 1:  #laatste regel van normale content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            char = r"\x{}".format(code).decode('string-escape').decode(
                'latin-1')
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            return char

        article.text = re.sub("=[A-Z0-9]{2}", character, article.text)

        yield article
Example #27
def parse_article(art):
    """
    A LexisNexis article consists of five parts:
    1) a header
    2) the title and possibly a byline
    3) a block of meta fields
    4) the body
    5) a block of meta fields

    The header consists of 'centered' lines, i.e. lines starting with a whitespace character.
    The title (and byline) are left-justified, unmarked lines before the first meta field.
    The meta fields are of the form FIELDNAME: value and can contain various field names.
    The body starts after two blank lines, or at the first line that does not match the meta field form.
    The body ends with a 'load date', a FIELDNAME: DATE field ending with a four-digit year.
    """
    online = parse_online_article(art)
    if online:
        return online
    header, title, meta, body = [], [], [], []
    header_headline = []

    def next_is_indented(lines, skipblank=True):
        if len(lines) <= 1: return False
        if not lines[1].strip():
            if not skipblank: return False
            return next_is_indented(lines[1:])
        return lines[1].startswith(" ")

    def followed_by_date_block(lines):
        # this text is followed by a date block
        # possibly, there is another line in the first block
        # (blank line)
        #          indented date line
        #          optional second indented date line
        # (blank line)
        if len(lines) < 5: return False
        if ((not lines[1].strip()) and lines[2].startswith(" ")
                and (not lines[3].strip())):
            return True
        if ((not lines[1].strip()) and lines[2].startswith(" ")
                and lines[3].startswith(" ") and (not lines[4].strip())):
            return True
        if not lines[1].strip(): return False
        if lines[1].startswith(" "): return False
        return followed_by_date_block(lines[1:])

    def _in_header(lines):
        if not lines: return False
        if not lines[0].strip(): return True  # blank line

        # indented line spanning page width: header
        if (not lines[0].startswith(" ")
                and next_is_indented(lines, skipblank=False)
                and len(lines[0].strip()) > 75):
            return True

        # non-indented TITLE or normal line followed by indented line
        if (not lines[0].startswith(" ")) and next_is_indented(lines):
            header_headline.append(lines.pop(0))
        else:
            while (not lines[0].startswith(" ")
                   ) and followed_by_date_block(lines):
                header_headline.append(lines.pop(0))

        # check again after possible removal of header_headline
        if not lines: return False
        if not lines[0].strip(): return True  # blank line
        if lines[0].startswith(" "): return True  # indented line

    def _get_header(lines):
        """Consume and return all lines that are indented (ie the list is changed in place)"""
        while _in_header(lines):
            line = lines.pop(0)
            line = line.strip()
            if line:
                if re.match('Copyright \d{4}', line):
                    line = line[len('Copyright xxxx'):]
                yield line

    def _get_headline(lines):
        """Return title and byline, consuming the lines"""
        headline, byline = [], []
        target = headline

        while lines:
            line = lines[0].strip()
            if RES.BODY_META.match(line):
                return None, None
            if not line:
                # they thought of something new again...
                # title\n\nbyline\n\nLENGTH:
                # so empty line is not always the end
                if (len(lines) > 4 and (not lines[2]) and lines[1]
                        and RES.BODY_META.match(lines[3])
                        and (not RES.BODY_META.match(lines[1]))):
                    target = byline
                else:
                    break
            if line.endswith(";"):
                target.append(line[:-1])
                target = byline
            else:
                target.append(line)
            del lines[0]
        return (re.sub("\s+", " ", " ".join(x)) if x else None
                for x in (headline, byline))

    def _get_meta(lines):
        """
        Return meta key-value pairs. Stop if body start criterion is found
        (eg two blank lines or non-meta line)
        """
        while lines:
            line = lines[0].strip()
            next_line = lines[1].strip() if len(lines) >= 2 else None

            meta_match = RES.BODY_META.match(line)
            if ((not bool(line) and not bool(next_line))
                    or (line and not meta_match)):
                # either two blank lines or a non-meta line
                # indicate start of body, so end of meta
                break
            del lines[0]
            if meta_match:
                key, val = meta_match.groups()
                key = key.lower()
                key = BODY_KEYS_MAP.get(key, key)
                # multi-line meta: add following non-blank lines
                while lines and lines[0].strip():
                    val += " " + lines.pop(0)
                val = re.sub("\s+", " ", val)
                yield key, val.strip()

    def _get_body(lines):
        """split lines into body and postmatter"""
        # index of headline or end of body
        try:
            i = next(i for (i, line) in enumerate(lines)
                     if RES.BODY_END_OR_COPYRIGHT.match(line.strip()))
            return lines[:i], lines[i:]
        except StopIteration:
            return lines, []

    lines = _strip_article(art).split("\n")

    header = list(_get_header(lines))
    if not lines:
        # Something is wrong with this article, skip it
        return

    if header_headline:
        title = re.sub("\s+", " ", " ".join(header_headline)).strip()
        if ";" in title:
            title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            byline = None
        if re.match("[A-Z]+:", title):
            title = title.split(":", 1)[1]
    else:
        title, byline = _get_headline(lines)
    meta = dict(_get_meta(lines))
    if title is None:
        if 'title' in meta:
            title = meta.pop('title')
        elif 'kop' in meta:
            title = meta.pop('kop')

    body, lines = _get_body(lines)

    meta.update(dict(_get_meta(lines)))

    def _get_source(lines, i):
        source = lines[0 if i > 0 else 1]
        if source.strip() in ("PCM Uitgevers B.V.", "De Persgroep Nederland BV"
                              ) and i > 2 and lines[i - 1].strip():
            source = lines[i - 1]
        return source

    date, dateline, source = None, None, None
    for i, line in enumerate(header):
        if _is_date(line):
            date = line
            dateline = i
            source = _get_source(header, i)
            break

    if date is None:  # try looking for month - year only notation by prepending a 1
        for i, line in enumerate(header):
            line = "1 {line}".format(**locals())
            if _is_date(line):
                date = line
                source = _get_source(header, i)
    if date is None:  # try looking for season names
        # TODO: hack; reimplement this more generally!
        for i, line in enumerate(header):
            if line.strip() == "Winter 2008/2009":
                date = "2009-01-01"
                source = _get_source(header, i)

    def find_re_in(pattern, lines):
        for line in lines:
            m = re.search(pattern, line)
            if m: return m

    if date is None:
        yearmatch = find_re_in("(.*)(\d{4})$", header)
        if yearmatch:
            month, year = yearmatch.groups()
            month = MONTHS.get(month.replace(",", "").strip().lower(), 1)
            date = "{year}-{month:02}-01".format(**locals())
            source = header[0]
            # this is probably a journal, let's see if we can find an issue
            issuematch = find_re_in("[-\d]+[^\d]+\d+", header)
            if issuematch:
                meta['issue'] = issuematch.group(0)

        elif [x.strip()
              for x in header] in (["India Today"], ["Business Today"]):
            date = meta.pop("load-date")
            source = header[0]
        else:
            raise ParseError(
                "Couldn't find date in header: {header!r}\n{art!r}".format(
                    **locals()))

    date = toolkit.read_date(date)
    if dateline is not None and len(header) > dateline + 1:
        # next line might contain time
        timeline = header[dateline + 1]
        m = re.search(r"\b\d?\d:\d\d\s(PM\b)?", timeline)
        if m and date.time().isoformat() == '00:00:00':
            datestr = " ".join([date.isoformat()[:10], m.group(0)])
            date = toolkit.read_date(datestr)

    m = re.match("copyright\s\xa9?\s?(\d{4})?(.*)", source, re.I)
    if m:
        source = m.group(2)
    source = source.strip()

    text = "\n".join(body).strip()

    if 'graphic' in meta and (not text):
        text = meta.pop('graphic')

    if title is None:
        if 'headline' in meta and 'title' not in meta:
            meta['title'] = meta.pop('headline')
        if 'title' in meta:
            title = re.sub("\s+", " ", meta.pop('title')).strip()
            if ";" in title and not byline:
                title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            title = "No title found!"

    if 'byline' in meta:
        if byline:
            title += "; %s" % byline
        byline = meta.pop('byline')

    if 'length' in meta:
        meta['length_int'] = meta.pop('length')
    if 'length_int' in meta:
        meta['length_int'] = int(meta['length_int'].split()[0])
    meta.update(
        dict(title=title.strip(),
             byline=byline,
             text=text,
             date=date,
             medium=source))
    meta = {k: v for (k, v) in meta.items() if v}
    return meta
Example #28
def parse_page(doc_elements):
    """Parses an APA page given in a list of Etree elements."""
    doc, elements = doc_elements
    elements = [e for e in elements if not isinstance(e, lxml.html.HtmlComment)]
    element_set = set(elements)
    result = try_alternative(elements)
    if result is not None:
        return result

    source_tags = doc.cssselect('meta[name=author]')
    if source_tags:
        source = source_tags[0].get('content')
    else:
        source = None

    headline = set(get_descendants(doc.cssselect("b"))) & element_set
    meta = (set(get_descendants(doc.cssselect("i"))) & element_set) - headline
    text = element_set - (headline | meta)
    headline = sorted(get_roots(headline), key=lambda e: elements.index(e))

    # Some formats don't have a bold headline. Instead, the first line is the headline.
    first_line_is_headline = False
    if not headline and source == "AOMweb":
        first_line_is_headline = True

    if not headline and not first_line_is_headline:
        raise ApaError("No possible headlines found.")

    remove_tree(meta, ["b"])
    remove_tree(text, ["b", "i"])

    # Some text in italics is not metadata. We only use text before headline elements
    # for metadata.
    if not first_line_is_headline:
        lesser_than_headline = lambda e: elements.index(e) <= elements.index(headline[0])
        meta = get_nonempty(filter(lesser_than_headline, meta))
    else:
        meta = get_nonempty(meta)

    # Parse metadata
    metadata = {}
    for el in list(meta):
        if get_metadata(metadata, el):
            meta.remove(el)

    if meta:
        metadata["byline"] = " - ".join(m.text for m in meta)

    # Convert date properties to datetime object
    year, month, day = metadata["year"], metadata["month"], metadata["day"]
    hour, minute = metadata.get("hour"), metadata.get("minute")

    datestring = "{day}.{month}.{year}"
    if hour is not None and minute is not None:
        datestring += ", {hour}:{minute}"

    metadata["date"] = read_date(datestring.format(**locals()))
    for prop in ("year", "month", "day", "hour", "minute"):
        if prop in metadata:
            del metadata[prop]

    # Clean data and get headline
    metadata["medium"] = metadata.get("medium", "APA - Unknown").strip().strip('"')

    if first_line_is_headline:
        medium = metadata["medium"].strip()
    else:
        medium, headline = metadata["medium"].strip(), "".join(["".join(e.itertext()) for e in headline]).strip()

        if medium in headline:
            headline = headline.split("-", medium.count("-") + 1)[-1]

    if "section" in metadata and metadata["section"] is None:
        del metadata["section"]

    # Get text. Since ordering is lost in sets, restore original order of elements
    text = "".join(get_text(sorted(text, key=lambda e: elements.index(e)))).strip()

    if first_line_is_headline:
        headline, text = re.split("\n *\n", text, 1)

    metadata["title"] = headline
    metadata["length"] = sum(1 for w in RE_NONWORD.split(text) if w)

    return metadata, text
Example #29
 def parse_date(d):
     if isinstance(d, list) and len(d) == 1:
         d = d[0]
     if isinstance(d, str):
         d = toolkit.read_date(d)
     return d.isoformat()
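A quick sketch of the normalization performed on the three input shapes this helper accepts:

import datetime

parse_date(["2012-01-01"])                   # -> "2012-01-01T00:00:00"
parse_date("31. Januar 2009")                # -> "2009-01-31T00:00:00"
parse_date(datetime.datetime(2009, 1, 31))   # -> "2009-01-31T00:00:00"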
Example #30
def parse_article(art):
    """
    A LexisNexis article consists of five parts:
    1) a header
    2) the title and possibly a byline
    3) a block of meta fields
    4) the body
    5) a block of meta fields

    The header consists of 'centered' lines, i.e. lines starting with a whitespace character.
    The title (and byline) are left-justified, unmarked lines before the first meta field.
    The meta fields are of the form FIELDNAME: value and can contain various field names.
    The body starts after two blank lines, or at the first line that does not match the meta field form.
    The body ends with a 'load date', a FIELDNAME: DATE field ending with a four-digit year.
    """
    online = parse_online_article(art)
    if online:
        return online
    header, title, meta, body = [], [], [], []
    header_headline = []
    metadata_lang = None

    def next_is_indented(lines, skipblank=True):
        if len(lines) <= 1: return False
        if not lines[1].strip():
            if not skipblank: return False
            return next_is_indented(lines[1:])
        return lines[1].startswith(" ")

    def followed_by_date_block(lines):
        # this text is followed by a date block
        # possibly, there is another line in the first block
        # (blank line)
        #          indented date line
        #          optional second indented date line
        # (blank line)
        if len(lines) < 5: return False
        if ((not lines[1].strip()) and
                lines[2].startswith(" ") and
                (not lines[3].strip())):
            return True
        if ((not lines[1].strip()) and
                lines[2].startswith(" ") and
                lines[3].startswith(" ") and
                (not lines[4].strip())):
            return True
        if not lines[1].strip(): return False
        if lines[1].startswith(" "): return False
        return followed_by_date_block(lines[1:])

    def _in_header(lines):
        if not lines: return False
        if not lines[0].strip(): return True  # blank line

        # indented line spanning page width: header
        if (not lines[0].startswith(" ")
            and next_is_indented(lines, skipblank=False)
            and len(lines[0].strip()) > 75):
            return True

        # non-indented TITLE or normal line followed by indented line
        if (not lines[0].startswith(" ")) and next_is_indented(lines):
            header_headline.append(lines.pop(0))
        else:
            while (not lines[0].startswith(" ")) and followed_by_date_block(lines):
                header_headline.append(lines.pop(0))

        # check again after possible removal of header_headline
        if not lines: return False
        if not lines[0].strip(): return True  # blank line
        if lines[0].startswith(" "): return True  # indented line

    def _get_header(lines):
        """Consume and return all lines that are indented (ie the list is changed in place)"""
        while _in_header(lines):
            line = lines.pop(0)
            line = line.strip()
            if line:
                if re.match('Copyright \d{4}', line):
                    line = line[len('Copyright xxxx'):]
                yield line

    def _get_headline(lines):
        """Return title and byline, consuming the lines"""
        headline, byline = [], []
        target = headline

        while lines:
            line = lines[0].strip()
            if RES.BODY_META.match(line):
                return None, None
            if not line:
                # they thought of something new again...
                # title\n\nbyline\n\nLENGTH:
                # so empty line is not always the end
                if (len(lines) > 4 and (not lines[2]) and lines[1]
                    and RES.BODY_META.match(lines[3]) and (not RES.BODY_META.match(lines[1]))):
                    target = byline
                else:
                    break
            if line.endswith(";"):
                target.append(line[:-1])
                target = byline
            else:
                target.append(line)
            del lines[0]
        return (re.sub("\s+", " ", " ".join(x)) if x else None
                for x in (headline, byline))
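
    # Sketch (hypothetical): for lines starting with
    # ["My title;", "John Doe", "", "LENGTH: 345 words", ...] this returns
    # ("My title", "John Doe"): the trailing ';' switches the target to byline.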

    def _get_meta(lines, after_body=False) -> Iterable[Tuple[str, str, str]]:
        """
        Return meta key-value pairs. Stop if body start criterion is found
        (eg two blank lines or non-meta line)
        """
        nonlocal metadata_lang
        while lines:
            line = lines[0].strip()
            next_line = lines[1].strip() if len(lines) >= 2 else None

            meta_match = RES.BODY_META.match(line)
            if ((not bool(line) and not bool(next_line))
                or (line and not meta_match)):
                # either two blank lines or a non-meta line
                # indicate start of body, so end of meta
                break
            if meta_match and not after_body:
                # if the key is not known, and the next non-empty line is body, treat this line as part of body
                key, val = meta_match.groups()
                if val.strip() and key.lower() not in WELL_KNOWN_BODY_KEYS:
                    def next_block(lines):
                        found_blank = False
                        for l in lines:
                            l = l.strip()
                            if not l:
                                found_blank = True
                            elif found_blank:
                                return l
                    next_line = next_block(lines)
                    if next_line and not RES.BODY_META.match(next_line):
                        break
            del lines[0]
            if meta_match:
                key, val = meta_match.groups()
                orig_key = key
                key = key.lower()

                # detect language before mapping to English
                if metadata_lang is None and key in METADATA_LANGUAGE_MAP:
                    metadata_lang = METADATA_LANGUAGE_MAP[key]

                key = BODY_KEYS_MAP.get(key, key)
                # multi-line meta: add following non-blank lines
                while lines and lines[0].strip():
                    val += " " + lines.pop(0)
                val = re.sub("\s+", " ", val)
                yield orig_key, key, val.strip()
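
    # Sketch (hypothetical; the contents of BODY_KEYS_MAP are assumed here):
    # if BODY_KEYS_MAP maps 'titel' to 'title', a line "TITEL: Voorbeeld"
    # yields ('TITEL', 'title', 'Voorbeeld') -- original key, mapped key, value.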

    def _get_body(lines):
        """Split lines into body and postmatter"""
        # find the index of the copyright / end-of-body marker, if any
        try:
            i = next(i for (i, line) in enumerate(lines) if RES.BODY_END_OR_COPYRIGHT.match(line.strip()))
            return lines[:i], lines[i:]
        except StopIteration:
            return lines, []

    lines = _strip_article(art).split("\n")

    header = list(_get_header(lines))
    if not lines:
        # Something is wrong with this article, skip it
        return

    if header_headline:
        title = re.sub("\s+", " ", " ".join(header_headline)).strip()
        if ";" in title:
            title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            byline = None
        if re.match("[A-Z]+:", title):
            title = title.split(":", 1)[1]
    else:
        title, byline = _get_headline(lines)

    head_meta_fields = list(((ok, k), (k, v)) for ok, k, v in _get_meta(lines))
    orig_keys, meta = zip(*head_meta_fields) if head_meta_fields else ((), ())
    orig_keys = OrderedDict(orig_keys)
    meta = dict(meta)

    if title is None:
        if 'title' in meta:
            title = meta.pop('title')
        elif 'kop' in meta:
            title = meta.pop('kop')

    body, lines = _get_body(lines)

    meta.update({k: v for _, k, v in _get_meta(lines, after_body=True)})

    def _get_source(lines, i):
        source = lines[0 if i > 0 else 1]
        if source.strip() in ("PCM Uitgevers B.V.", "De Persgroep Nederland BV") and i > 2 and lines[i - 1].strip():
            source = lines[i - 1]
        return source
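
    # _get_source picks the first header line as the medium, unless the date
    # was itself the first line (then the second), and skips over generic
    # publisher names such as "PCM Uitgevers B.V." in favour of the line
    # directly above the date.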

    def _get_date_languages(meta, metadata_lang):
        article_langs = [lang.lower().strip()
                         for lang in RES.SPLIT_LANGUAGES.split(meta.get('language', ""))
                         if lang != ""]

        if metadata_lang is None:
            log.debug("Failed to detect metadata language. Falling back to defaults")
            return None

        if not article_langs:
            # failed to guess language, fall back to default
            return None

        article_langs.append(metadata_lang)
        return tuple(article_langs)

    lang_pool = _get_date_languages(meta, metadata_lang)

    date, dateline, source = None, None, None

    for i, line in enumerate(header):
        if _is_date(line, language_pool=lang_pool):
            date = line
            dateline = i
            source = _get_source(header, i)
            break

    if date is None:  # try month-year notation by prepending a day, e.g. "1 March 2009"
        for i, line in enumerate(header):
            line = "1 {line}".format(**locals())
            if _is_date(line, language_pool=lang_pool):
                date = line
                source = _get_source(header, i)
                break
    if date is None:  # try looking for season names
        # TODO: hack, reimplement more generally!
        for i, line in enumerate(header):
            if line.strip() == "Winter 2008/2009":
                date = "2009-01-01"
                source = _get_source(header, i)
                break
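
    # Sketch of the month-year fallback (hypothetical): a header line
    # "March 2009" becomes "1 March 2009", which _is_date accepts, so the
    # article is dated to the first of the month.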

    def find_re_in(pattern, lines):
        for line in lines:
            m = re.search(pattern, line)
            if m: return m

    if date is None:
        yearmatch = find_re_in("(.*)(\d{4})$", header)
        if yearmatch:
            month, year = yearmatch.groups()
            month = MONTHS.get(month.replace(",", "").strip().lower(), 1)
            date = "{year}-{month:02}-01".format(**locals())
            source = header[0]
            # this is probably a journal, let's see if we can find an issue
            issuematch = find_re_in("[-\d]+[^\d]+\d+", header)
            if issuematch:
                meta['issue'] = issuematch.group(0)

        elif [x.strip() for x in header] in (["India Today"], ["Business Today"]):
            date = meta.pop("load-date")
            source = header[0]
        else:
            raise ParseError("Couldn't find date in header: {header!r}\n{art!r}".format(**locals()))

    date = toolkit.read_date(date)
    if dateline is not None and len(header) > dateline + 1:
        # the next header line might contain a time of day; if the parsed
        # date has no time component, merge the two
        timeline = header[dateline + 1]
        m = re.search(r"\b\d?\d:\d\d\s(PM\b)?", timeline)
        if m and date.time().isoformat() == '00:00:00':
            datestr = " ".join([date.isoformat()[:10], m.group(0)])
            date = toolkit.read_date(datestr)
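
    # Sketch (hypothetical): for date 2008-09-01 and next header line
    # "12:44 PM AEST", the regex captures "12:44 PM" and the date becomes
    # read_date("2008-09-01 12:44 PM").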

    m = re.match("copyright\s\xa9?\s?(\d{4})?(.*)", source, re.I)
    if m:
        source = m.group(2)
    source = source.strip()

    text = "\n".join(body).strip()

    if 'graphic' in meta and (not text):
        text = meta.pop('graphic')

    if title is None:
        if 'headline' in meta and 'title' not in meta:
            meta['title'] = meta.pop('headline')
        if 'title' in meta:
            title = re.sub("\s+", " ", meta.pop('title')).strip()
            if ";" in title and not byline:
                title, byline = [x.strip() for x in title.split(";", 1)]
        else:
            # test if the title was mistakenly parsed as a meta field
            title_mistake = next(iter(orig_keys.items()), None)
            if title_mistake and title_mistake[0].lower() not in WELL_KNOWN_BODY_KEYS:
                val = meta.pop(title_mistake[1])
                title = "{}: {}".format(title_mistake[1], val)
            else:
                title = "No title found!"

    if 'byline' in meta:
        if byline:
            title += "; %s" % byline
        byline = meta.pop('byline')

    if 'length' in meta:
        meta['length_int'] = meta.pop('length')
    if 'length_int' in meta:
        meta['length_int'] = int(meta['length_int'].split()[0])
    meta.update(dict(title=title.strip(), byline=byline, text=text, date=date, medium=source))
    meta = {k: v for (k, v) in meta.items() if v}
    return meta
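
# Minimal usage sketch (hypothetical: `parse_article` stands in for the
# enclosing function, whose name and `art` argument are not shown in this
# excerpt):
#
#     fields = parse_article(art)
#     if fields:
#         print(fields["date"], fields["medium"], fields["title"])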