def getarticle(self, headline, lines):
        """Build an Article from a headline plus the raw text lines of one item.

        lines[1] must contain a dd-mm-yyyy date and optionally a "(p.N)"
        page marker; lines[2:] form the body text.
        """
        article = Article(headline=headline)
        text = ""
        for line in lines[2:]:
            if len(line) > 2:  # skip near-empty filler lines
                text += "\n" + line

        # Undo hyphenation and collapse whitespace into single-line text.
        text = text.replace("-\n", "")
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")

        article.text = text
        # Raw strings: "\-" in a plain string literal is an invalid escape
        # (SyntaxWarning on modern Python); the patterns are unchanged.
        date_pattern = re.compile(r"([0-9]{2})-([0-9]{2})-([0-9]{4})")
        result = date_pattern.search(lines[1])
        article.date = date(
            int(result.group(3)),
            int(result.group(2)),
            int(result.group(1)))
        pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:
            article.pagenr = int(result.group(1))

        # Assign a medium when the headline occurs in the index.
        for h, medium in self.index:
            if article.headline.lower().strip() in h.lower().strip():
                article.medium = self.create_medium(medium)

        return article
Esempio n. 2
0
def create_test_article(create=True, articleset=None, deduplicate=True, properties=None, **kargs):
    """Create a test article.

    @param create: when True, persist via Article.create_articles
    @param articleset: optional set the article is added to on create
    @param properties: extra article properties; datetime-typed values may
        be passed as strings and are parsed here
    """
    from amcat.models.article import Article

    # Get static properties
    title = kargs.pop("title", "test title {}: {}".format(_get_next_id(), uuid4()))
    date = kargs.pop("date", datetime.datetime.now())
    url = kargs.pop("url", "http://example.com")
    text = kargs.pop("text", "Lorum Ipsum: {}".format(_get_next_id()))
    # Resolve the project lazily: dict.pop evaluates its default argument
    # eagerly, so the old kargs.pop("project", ...) form called
    # create_test_project() (creating a stray project as a side effect)
    # even when a project was supplied.
    project = kargs.pop("project", None)
    if project is None:
        project = articleset.project if articleset is not None else create_test_project()
    parent_hash = kargs.pop("parent_hash", None)
    hash = kargs.pop("hash", None)

    # Caller is allowed to pass date as string
    if isinstance(date, str):
        date = _parse_date(date)

    a = Article(title=title, date=date, url=url, text=text, project=project, parent_hash=parent_hash, hash=hash)

    if properties:
        for propname, value in properties.items():
            if get_property_primitive_type(propname) == datetime.datetime and isinstance(value, str):
                properties[propname] = _parse_date(value)
        a.properties.update(properties)

    if create:
        Article.create_articles([a], articleset, deduplicate=deduplicate)

    return a
Esempio n. 3
0
def create_test_article(create=True, articleset=None, deduplicate=True, properties=None, project=None, **kargs):
    """Create (and optionally save) an Article for use in tests."""
    from amcat.models.article import Article

    # Static fields, falling back to generated test values.
    pop = kargs.pop
    title = pop("title", "test title {}: {}".format(_get_next_id(), uuid4()))
    date = pop("date", datetime.datetime.now())
    url = pop("url", "http://example.com")
    text = pop("text", "Lorum Ipsum: {}".format(_get_next_id()))
    if project is None:
        project = create_test_project() if articleset is None else articleset.project
    parent_hash = pop("parent_hash", None)
    hash = pop("hash", None)

    # A string date is parsed into a datetime.
    if isinstance(date, str):
        date = _parse_date(date)

    article = Article(title=title, date=date, url=url, text=text,
                      project=project, parent_hash=parent_hash, hash=hash)

    if properties:
        for name in list(properties):
            raw = properties[name]
            if get_property_primitive_type(name) == datetime.datetime and isinstance(raw, str):
                properties[name] = _parse_date(raw)
        article.properties.update(properties)

    if create:
        Article.create_articles([article], articleset, deduplicate=deduplicate)

    return article
Esempio n. 4
0
    def parse_item(self, item):
        """Build an Article from `item`, a list of html tags."""
        article = Article(metastring={})
        article.text = self._parse_text(item)

        for element in item:
            name = element.tag
            if name == "h2":
                article.headline = element.text
            elif name == "i":
                article = self.parse_dateline(element.text_content(), article)

        return article
Esempio n. 5
0
def create_test_set(articles=0, **kargs):
    """Create a test (Article) set.

    @param articles: an int (number of test articles to create) or an
        iterable of existing articles to add to the set.
    """
    from amcat.models.articleset import ArticleSet, Article
    # .count() asks the database for the size; len(queryset) would fetch
    # and materialize every row just to count them.
    if "name" not in kargs: kargs["name"] = "testset_%i" % ArticleSet.objects.count()
    if "project" not in kargs: kargs["project"] = create_test_project()
    if "id" not in kargs: kargs["id"] = _get_next_id()
    s = ArticleSet.objects.create(**kargs)
    if isinstance(articles, int):
        articles = [create_test_article(create=False) for _ in range(articles)]
        Article.create_articles(articles, articleset=s, check_duplicate=False, create_id=True)
    elif articles:
        s.add_articles(articles)
    return s
Esempio n. 6
0
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article, filling in defaults for missing fields."""
    from amcat.models.article import Article

    if "project" not in kargs:
        kargs["project"] = create_test_project()
    if "date" not in kargs:
        kargs["date"] = "2000-01-01"
    if "medium" not in kargs:
        kargs["medium"] = create_test_medium()
    if "id" not in kargs:
        kargs["id"] = _get_next_id()
    if "headline" not in kargs:
        kargs["headline"] = "test headline"

    article = Article(**kargs)
    if create:
        Article.create_articles([article], articleset, check_duplicate=check_duplicate)
    return article
Esempio n. 7
0
def create_test_set(articles=0, **kargs):
    """Create a test (Article) set.

    @param articles: an int (number of test articles to create) or an
        iterable of existing articles to add to the set.
    """
    from amcat.models.articleset import ArticleSet, Article
    # .count() asks the database for the size; len(queryset) would fetch
    # and materialize every row just to count them.
    if "name" not in kargs: kargs["name"] = "testset_%i" % ArticleSet.objects.count()
    if "project" not in kargs: kargs["project"] = create_test_project()
    if "id" not in kargs: kargs["id"] = _get_next_id()
    s = ArticleSet.objects.create(**kargs)
    if isinstance(articles, int):
        articles = [create_test_article(create=False) for _ in range(articles)]
        Article.create_articles(articles, articleset=s, check_duplicate=False)
    elif articles:
        s.add_articles(articles)
    return s
Esempio n. 8
0
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article; default values are generated for absent fields."""
    from amcat.models.article import Article

    # Factories are called lazily, only when the field is missing.
    for key, make_default in (
        ("project", create_test_project),
        ("date", lambda: "2000-01-01"),
        ("medium", create_test_medium),
        ("id", _get_next_id),
        ("headline", lambda: "test headline"),
    ):
        if key not in kargs:
            kargs[key] = make_default()

    article = Article(**kargs)
    if create:
        Article.create_articles([article], articleset,
                                check_duplicate=check_duplicate, create_id=True)
    return article
Esempio n. 9
0
    def scrape_1(self, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [
                div for div in _html.cssselect("#sort div")
                if "sort_" in div.get('id')
            ]
        else:
            # Previously `divs` stayed unbound here and the loop below
            # raised NameError; fail fast with a clear message instead
            # (matching the sibling scrape_1 implementation in this file).
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = Article(metastring={})
            article.metastring['html'] = div
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            # NOTE(review): this stores the lxml element itself, not its
            # text; the sibling scrape_1 uses .text_content() here --
            # confirm which is intended before changing.
            article.text = div.cssselect("#articleIntro")[0]
            articlepage = div.cssselect("#articlePage")
            if articlepage:
                article.pagenr, article.section = self.get_pagenum(
                    articlepage[0].text)

            article.medium = self.get_medium(
                div.cssselect("#sourceTitle")[0].text)
            date_str = div.cssselect("#articleDate")[0].text
            try:
                article.date = readDate(date_str)
            except ValueError:
                log.error(
                    "parsing date \"{date_str}\" failed".format(**locals()))
            else:
                yield article
Esempio n. 10
0
 def parse_item(self, item):
     """Turn `item` (a list of html tags) into an Article; raises when no headline is found."""
     article = Article(metastring={})
     article.text = self._parse_text(item)
     for element in item:
         if element.tag == "h2":
             # Prefer the tag's own text; fall back to its <span> content.
             article.headline = element.text or element.cssselect("span")[0].text_content()
         elif element.tag == "i" or (element.tag == "p" and element.cssselect("i")):
             article = self.parse_dateline(element.text_content(), article)
     if not article.headline:
         raise Exception("Article has no headline")
     return article
Esempio n. 11
0
 def parse_item(self, item):
     """Parse one item (list of html tags) into an Article object."""
     article = Article(metastring={})
     article.text = self._parse_text(item)
     for tag in item:
         if tag.tag == "h2":
             if tag.text:
                 headline = tag.text
             else:
                 headline = tag.cssselect("span")[0].text_content()
             article.headline = headline
         elif tag.tag == "i" or (tag.tag == "p" and tag.cssselect("i")):
             article = self.parse_dateline(tag.text_content(), article)
     if article.headline:
         return article
     raise Exception("Article has no headline")
Esempio n. 12
0
    def _scrape_unit(self, fn):
        """Fetch one .stl subtitle file over FTP and yield it as an Article."""
        # retrbinary delivers bytes; writing them into a StringIO raises
        # TypeError on Python 3, so buffer with BytesIO instead.
        from io import BytesIO
        dest = BytesIO()
        with self.ftp() as ftp:
            ftp.retrbinary(b'RETR %s' % (fn.encode('latin-1')), dest.write)
        body = STLtoText(dest.getvalue())
        body = body.decode('latin-1', 'ignore').strip().lstrip('888').strip()
        title = fn.split('/')[-1]
        medium = title.split('-')[-1].split('.stl')[0].strip().lower()
        date = getDate(title)

        # The 20:00 NOS broadcast gets its own medium label.
        if medium == 'nos journaal' and int(format(date, '%H')) == 20 and int(
                format(date, '%M')) == 0:
            medium = 'nos journaal 20:00'
        med = Medium.get_or_create(medium)
        if med.id in mediadict:
            print("saving %s as %s" % (med.id, mediadict[med.id]))
            med = Medium.objects.get(id=mediadict[med.id])

        headline = "%s (%s)" % (medium, fn.replace('.stl', '').strip())
        art = Article(headline=headline,
                      text=body,
                      medium=med,
                      date=date,
                      url=fn)
        yield art
Esempio n. 13
0
    def parse_document(self, file):
        """Convert an uploaded .doc/.docx (or plain-text) file into an Article.

        Metadata comes from self.options; a missing date is derived from a
        "YYYY-MM-DD_name" style filename, a missing headline from the file
        name and a missing section from the directory name.
        """
        dirname, filename = os.path.split(file.name)
        filename, ext = os.path.splitext(filename)

        metadata = dict((k, v) for (k, v) in self.options.items()
                        if k in ["medium", "headline", "project", "date", "section"])
        if not metadata["date"]:
            datestring, filename = filename.split("_", 1)
            metadata["date"] = toolkit.read_date(datestring)

        # The original repeated this headline fallback twice; once suffices.
        if not metadata["headline"].strip():
            metadata["headline"] = filename

        if not metadata["section"].strip():
            metadata["section"] = dirname

        # Try the matching converter first, then the other as a fallback.
        convertors = None
        if ext.lower() == ".docx":
            convertors = [_convert_docx, _convert_doc]
        elif ext.lower() == ".doc":
            convertors = [_convert_doc, _convert_docx]

        if convertors:
            text = _convert_multiple(file, convertors)
        else:
            text = file.text
        return Article(text=text, **metadata)
Esempio n. 14
0
    def parse_document(self, row):
        """Map a single csv row onto an Article.

        Each field in FIELDS is looked up through self.options (csv column
        name per field), parsed with PARSERS where one is registered, and
        parent links are recorded in self.parents / self.articles for
        later resolution.
        """
        kargs = dict(medium=self._medium)
        for fieldname in FIELDS:
            csvfield = self.options[fieldname]
            if not csvfield: continue
            val = row[csvfield]
            if val.strip():
                # Non-blank value: run the field-specific parser, if any.
                if fieldname in PARSERS:
                    val = PARSERS[fieldname](val)
            elif is_nullable(fieldname):
                val = None
            else:
                # Blank but required: keep the stripped (empty) string.
                val = val.strip()

            kargs[fieldname] = val

        # In case medium wasn't defined in csv
        medium = self._medium
        if medium is not None:
            kargs["medium"] = medium

        if self.parent_field:
            # Remember the parent id so articles can be linked afterwards.
            doc_id = kargs.get(self.id_field)
            parent_id = kargs.pop(self.parent_field)
            if parent_id:
                self.parents[doc_id] = parent_id

        article = Article(**kargs)
        if self.parent_field:
            self.articles[doc_id] = article

        return article
 def parse_item(self, item):
     """Turn `item` (a list of html tags) into an Article.

     <p>/<div> tags are collected as body text, <h2> becomes the headline
     and <i> is parsed as a dateline.
     """
     article = Article(metastring={})
     for tag in item:
         if tag.tag in ("p", "div"):
             # Append once the text list exists. The original test
             # `not (hasattr(article, 'text') or article.text)` was
             # inverted: it re-initialised the list for every tag, so
             # only the last paragraph survived.
             if isinstance(getattr(article, 'text', None), list):
                 article.text.append(tag)
             else:
                 article.text = [tag]
         elif tag.tag == "h2":
             article.headline = tag.text
         elif tag.tag == "i":
             article = self.parse_dateline(tag.text, article)
     # Convert the collected html fragments to plain text.
     article.text = "\n".join([html2text(html.tostring(bit)) for bit in article.text])
     return article
Esempio n. 16
0
    def add_articles(self, articles, add_to_index=True, monitor=None):
        """
        Add the given articles to this articleset. Implementation exists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        @param articles: articles to be added
        @type articles: iterable with indexing of integers or Article objects

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool

        @param monitor: progress monitor; a fresh ProgressMonitor is created
            when omitted. (The old `monitor=ProgressMonitor()` default was
            instantiated once at import time and shared between all calls.)
        """
        if monitor is None:
            monitor = ProgressMonitor()
        articles = {(art if type(art) is int else art.id) for art in articles}
        to_add = articles - self.get_article_ids()
        # Only use articles that exist
        to_add = list(Article.exists(to_add))

        monitor.update(10, "{n} articles need to be added".format(n=len(to_add)))
        ArticleSetArticle.objects.bulk_create(
            [ArticleSetArticle(articleset=self, article_id=artid) for artid in to_add],
            batch_size=100,
        )

        monitor.update(20, "{n} articleset articles added to database, adding to codingjobs".format(n=len(to_add)))
        cjarts = [CodedArticle(codingjob=c, article_id=a) for c, a in itertools.product(self.codingjob_set.all(), to_add)]
        CodedArticle.objects.bulk_create(cjarts)

        monitor.update(30, "{n} articles added to codingjobs, adding to index".format(n=len(cjarts)))
        if add_to_index:
            amcates.ES().add_to_set(self.id, to_add, monitor=monitor)
Esempio n. 17
0
    def add_articles(self,
                     article_ids,
                     add_to_index=True,
                     monitor=None):
        """
        Add the given articles to this articleset. Implementation exists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        @param article_ids: articles to be added
        @type article_ids: iterable with indexing of integers or Article objects

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool

        @param monitor: progress monitor; a fresh NullMonitor is created when
            omitted. (The old `monitor=NullMonitor()` default was instantiated
            once at import time and shared between all calls.)
        """
        if monitor is None:
            monitor = NullMonitor()
        monitor = monitor.submonitor(total=4)

        article_ids = {(art if type(art) is int else art.id)
                       for art in article_ids}

        # Only use articles that exist
        to_add = article_ids - self.get_article_ids()
        to_add = list(Article.exists(to_add))

        monitor.update(message="Adding {n} articles to {aset}..".format(
            n=len(to_add), aset=self))
        ArticleSetArticle.objects.bulk_create(
            [
                ArticleSetArticle(articleset=self, article_id=artid)
                for artid in to_add
            ],
            batch_size=100,
        )

        monitor.update(
            message=
            "{n} articleset articles added to database, adding to codingjobs.."
            .format(n=len(to_add)))
        cjarts = [
            CodedArticle(codingjob=c, article_id=a)
            for c, a in itertools.product(self.codingjob_set.all(), to_add)
        ]
        CodedArticle.objects.bulk_create(cjarts)

        if add_to_index:
            monitor.update(
                message="{n} articles added to codingjobs, adding to index".
                format(n=len(cjarts)))
            es = ES()
            es.add_to_set(self.id, to_add, monitor=monitor)
            es.refresh()  # We need to flush, or setting cache will fail
        else:
            monitor.update(2)

        # Add to property cache
        properties = ES().get_used_properties(article_ids=to_add)
        self._add_to_property_cache(properties)
    def _scrape_unit(self, url):
        """Scrape one parliamentary-questions document; yields one Article.

        Yields nothing when the xml cannot be fetched or has no metadata yet.
        """
        try:
            xml = self.getdoc(url)
        except Exception:  # narrowed from bare except: don't trap SystemExit/KeyboardInterrupt
            log.warn("COULD NOT FIND XML FOR %s" % url)
            return

        url = url.replace('.xml', '.html')
        metadict = self.getMetaDict(xml, printit=False)

        if len(metadict) == 0:
            log.warn(
                "NO METADATA FOR %s. SKIPPING ARTICLE (to be retrieved after officiele bekendmakingen finalizes it)"
                % url)
            return

        section = self.safeMetaGet(metadict, 'OVERHEID.category')
        document_id = metadict['DC.identifier']
        if document_id.count('-') == 1:
            # Default restored from the commented-out line: previously
            # `kamer` stayed unbound (NameError) when neither chamber matched.
            kamer = 'NA'
            if 'tweede' in metadict['DC.creator'].lower(): kamer = 'tk'
            if 'eerste' in metadict['DC.creator'].lower(): kamer = 'ek'
            document_id = document_id.replace('-', '-%s-' % kamer)

        # NOTE(review): `author` is computed but never used below.
        author = self.safeMetaGet(metadict, 'OVERHEIDop.ontvanger')
        try:
            archieftype = metadict['OVERHEIDop.ArchiefType']
        except KeyError:
            archieftype = metadict['DC.type']
        aanleiding = metadict['DC.title']
        try:
            vraagnummer = metadict['OVERHEIDop.vraagnummer'].strip()
        except KeyError:
            vraagnummer = self.safeMetaGet(metadict,
                                           'OVERHEIDop.vraagNummer').strip()

        headline = ("%s | %s - %s" %
                    (document_id, archieftype, vraagnummer)).strip()
        try:
            datestring = adhocDateFix(metadict['OVERHEIDop.datumOntvangst'])
        except KeyError:
            datestring = adhocDateFix(metadict['OVERHEIDop.datumIndiening'])
            headline += " (publicatiedatum)"

        # Dates occur both as yyyy-mm-dd and dd-mm-yyyy.
        try:
            date = datetime.datetime.strptime(datestring, '%Y-%m-%d')
        except ValueError:
            date = datetime.datetime.strptime(datestring, '%d-%m-%Y')

        body = "%s\n\n%s" % (aanleiding, self.getBody(xml))

        print("SAVING: %s" % url)
        yield Article(headline=headline,
                      byline=vraagnummer,
                      text=body,
                      date=date,
                      section=section,
                      url=url)
Esempio n. 19
0
    def create_article(self):
        """Convert this document into an Article; unknown props go to metastring."""
        art = Article()

        meta = {}
        for prop, raw in self.getprops().items():
            converted = self._convert(raw)
            # Known article fields become attributes; the rest is stashed
            # in the metastring dict.
            if prop in _ARTICLE_PROPS:
                setattr(art, prop, converted)
            else:
                meta[prop] = converted

        art.metastring = str(meta)
        self.article = art
        return art
Esempio n. 20
0
    def create_article(self):
        """Build an Article from this document's (converted) properties."""
        # Convert everything up front, preserving property order.
        converted = {p: self._convert(v) for p, v in self.getprops().items()}

        art = Article()
        for prop, value in converted.items():
            if prop in _ARTICLE_PROPS:
                setattr(art, prop, value)

        # Everything that is not a known article field lands in metastring.
        leftovers = {p: v for p, v in converted.items() if p not in _ARTICLE_PROPS}
        art.metastring = str(leftovers)
        self.article = art
        return art
Esempio n. 21
0
def create_test_article(create=True, articleset=None, deduplicate=True, **kargs):
    """Create a test article, generating defaults for missing fields."""
    from amcat.models.article import Article

    # A string date is parsed into a datetime.
    raw_date = kargs.get("date")
    if isinstance(raw_date, str):
        kargs["date"] = read_date(raw_date)

    if "project" not in kargs:
        kargs["project"] = create_test_project()
    if "date" not in kargs:
        kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs:
        kargs["medium"] = create_test_medium()
    if "headline" not in kargs:
        kargs["headline"] = "test headline {} : {}".format(_get_next_id(), uuid4())
    if "text" not in kargs:
        kargs["text"] = "test text {}".format(_get_next_id())

    article = Article(**kargs)
    if create:
        Article.create_articles([article], articleset, deduplicate=deduplicate)
    return article
Esempio n. 22
0
def body_to_article(headline, byline, text, date, source, meta):
    """
    Create an Article-object based on given parameters. It raises an
    error (Medium.DoesNotExist) when the given source does not have
    an entry in the database.

    @param headline: headline of new Article-object
    @type headline: str

    @param byline: byline for new Article
    @type byline: NoneType, str

    @param text: text for new Article
    @type text: str

    @param date: date(time) for new Article
    @type date: datetime.date, datetime.datetime

    @param source: medium-label for new Article
    @type source: str

    @param meta: object containing all sorts of meta-information, most of
                 it suitable for metastring. However, some information
                 (author, length) will be extracted.
    @type meta: dictionary

    @return Article-object

    """
    log.debug("Creating article object for {headline!r}".format(**locals()))

    art = Article(headline=headline, byline=byline, text=text, date=date)

    art.medium = Medium.get_or_create(source)

    # Author / Section
    meta = meta.copy()
    art.author = meta.pop('author', None)
    art.section = meta.pop('section', None)
    if 'length' in meta:
        art.length = int(meta.pop('length').split()[0])
    else:
        # NOTE(review): counts spaces, which approximates (words - 1).
        art.length = art.text.count(" ")
    if 'url' in meta:
        art.url = meta.pop('url')
        # Raw string: "\s" is an invalid escape in a plain string literal
        # (SyntaxWarning on modern Python); the pattern is unchanged.
        art.url = re.sub(r"\s+", "", art.url)

    art.metastring = str(meta)

    return art
Esempio n. 23
0
    def getarticle(self, headline, lines):
        """Build an Article from a headline and the raw lines of one item.

        lines[1] must hold a dd-mm-yyyy date and optionally a "(p.N)" page
        marker; lines[2:] form the body text.
        """
        article = Article(headline=headline)
        text = ""
        for line in lines[2:]:
            if len(line) > 2:  # skip near-empty filler lines
                text += "\n" + line

        # Undo hyphenation and collapse whitespace into single-line text.
        text = text.replace("-\n", "")
        text = text.replace("  ", " ")
        text = text.replace("\n", " ")

        article.text = text
        # Raw strings: "\-" is an invalid escape in a plain string literal
        # (SyntaxWarning on modern Python); the patterns are unchanged.
        date_pattern = re.compile(r"([0-9]{2})-([0-9]{2})-([0-9]{4})")
        result = date_pattern.search(lines[1])
        article.date = date(int(result.group(3)), int(result.group(2)),
                            int(result.group(1)))
        pagenum_pattern = re.compile(r"\(p.([0-9]+)([0-9\-]+)?\)")
        result = pagenum_pattern.search(lines[1])
        if result:
            article.pagenr = int(result.group(1))

        # Assign a medium when the headline occurs in the index.
        for h, medium in self.index:
            if article.headline.lower().strip() in h.lower().strip():
                article.set_property("medium", self.get_medium(medium))

        return article
Esempio n. 24
0
def create_test_article(create=True, articleset=None, check_duplicate=False, **kargs):
    """Create a test article, filling in defaults for missing fields.

    NOTE: uses `basestring`, so this is Python 2 code.
    """
    from amcat.models.article import Article

    # Allow the caller to pass the date as a string.
    if "date" in kargs and isinstance(kargs["date"], basestring):
        kargs["date"] = read_date(kargs["date"])

    # Default values for required fields.
    if "project" not in kargs: kargs["project"] = create_test_project()
    if "date" not in kargs: kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs: kargs["medium"] = create_test_medium()
    if "id" not in kargs: kargs["id"] = _get_next_id()
    if 'headline' not in kargs: kargs['headline'] = 'test headline'
    if 'text' not in kargs: kargs["text"] = "\n\n".join(map(str, range(5)))

    a = Article(**kargs)
    if create:
        Article.create_articles([a], articleset, check_duplicate=check_duplicate, create_id=True)
    return a
Esempio n. 25
0
    def parse_item(self, item):
        """Turn `item` (a list of html tags) into an Article.

        <p>/<div> tags are collected as body text, <h2> becomes the
        headline and <i> is parsed as a dateline.
        """
        article = Article(metastring={})
        for tag in item:
            if tag.tag in ("p", "div"):
                # Append once the text list exists. The original test
                # `not (hasattr(article, 'text') or article.text)` was
                # inverted: it re-initialised the list for every tag, so
                # only the last paragraph survived.
                if isinstance(getattr(article, 'text', None), list):
                    article.text.append(tag)
                else:
                    article.text = [tag]
            elif tag.tag == "h2":
                article.headline = tag.text
            elif tag.tag == "i":
                article = self.parse_dateline(tag.text_content(), article)
        # Convert the collected html fragments to plain text.
        article.text = "\n".join(
            [html2text(html.tostring(bit)) for bit in article.text])

        return article
Esempio n. 26
0
    def body_to_article(self, headline, byline, text, date, source, meta):
        """Create an Article from the parsed body parts.

        Raises Medium.DoesNotExist when `source` has no database entry.

        @param headline: headline of new Article-object (unicode / str)
        @param byline: byline for new Article (NoneType, unicode, str)
        @param text: text for new Article (unicode / str)
        @param date: date(time) for new Article
        @param source: medium-label for new Article (unicode / str)
        @param meta: dict with meta-information; author, section and length
                     are extracted, the remainder is stored as metastring.
        @return: Article-object
        """
        log.debug(
            "Creating article object for {headline!r}".format(**locals()))

        article = Article(headline=headline, byline=byline, text=text, date=date)
        article.medium = get_or_create(Medium, name=source)

        # Pull author/section/length out of the meta dict (on a copy, so
        # the caller's dict is untouched).
        remaining = meta.copy()
        article.author = remaining.pop('author', None)
        article.section = remaining.pop('section', None)
        if 'length' in remaining:
            article.length = int(remaining.pop('length').split()[0])
        else:
            article.length = article.text.count(" ")
        article.metastring = str(remaining)

        article.project = self.options['project']

        return article
Esempio n. 27
0
 def parse_item(self, item):
     """Turn `item` (a list of html tags) into an Article.

     Only the first <h2> becomes the headline and only the first
     dateline tag is parsed; raises when no headline is found.
     """
     article = Article()
     article.text = self._parse_text(item)
     headline_found = False
     dateline_found = False
     for tag in item:
         if tag.tag == "h2" and not headline_found:
             if tag.text:
                 article.headline = tag.text
             else:
                 article.headline = tag.cssselect("span")[0].text_content()
             headline_found = True
         elif (tag.tag == "i" or (tag.tag == "p" and tag.cssselect("i"))) and not dateline_found:
             # Parenthesized: `or` binds looser than `and`, so the original
             # condition applied `not dateline_found` only to the <p> branch
             # and re-parsed the dateline for every subsequent <i> tag.
             article = self.parse_dateline(tag.text_content(), article)
             dateline_found = True
     if not article.headline:
         raise Exception("Article has no headline")
     return article
Esempio n. 28
0
 def parse_file(self, file, encoding, data):
     """Yield an Article per record in `data`, mapped via the field_map options."""
     self.ln_query, records = data
     for record in records:
         fields = {}
         for field, setting in self.options['field_map'].items():
             # 'field' settings read from the record; anything else is a literal.
             if setting['type'] == 'field':
                 val = record.get(setting['value'])
             else:
                 val = setting['value']
             if val:
                 fields[field] = val
         yield Article(**fields)
Esempio n. 29
0
    def parse_document(self, row):
        """Map one csv row onto Article keyword arguments and build the Article."""
        kargs = {"medium": self.options["medium"]}
        for field in FIELDS:
            column = self.options[field]
            if not column:
                continue
            value = row[column]
            parser = PARSERS.get(field)
            if parser is not None:
                value = parser(value)
            kargs[field] = value

        return Article(**kargs)
Esempio n. 30
0
    def scrape_3(self, _html):
        """Some ugly MS Word format, as of 2014-03-03"""
        # Partition articles: each tag containing an <hr> closes the part.
        # NOTE(review): tags collected after the last <hr> are discarded,
        # and articles[0] (everything before the first <hr>) is skipped
        # below -- presumably a preamble; confirm against sample input.
        part = []
        articles = []
        for tag in _html.cssselect("body > div > *"):
            if tag.cssselect("hr"):
                articles.append(part)
                part = []
            else:
                part.append(tag)
        for tags in articles[1:]:
            article = Article()
            # NOTE(review): both the dateline and the headline read from
            # tags[1]; one of them (likely the dateline) may have been
            # meant to use tags[0] -- verify with source documents.
            dateline = tags[1].text_content().strip()
            article = self.parse_dateline(dateline, article)
            article.headline = tags[1].text_content().strip()
            html_str = "".join([html.tostring(t) for t in tags[2:]])
            article.text = html2text(html_str)
            article.metastring = {'html': html_str}

            yield article
Esempio n. 31
0
    def _scrape_unit(self, url):
        """Scrape one 'officiele bekendmakingen' document; yields one Article.

        Yields nothing when the xml cannot be fetched or carries no
        metadata yet.
        """
        try:
            xml = self.getdoc(url)
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt.
            log.warn("COULD NOT FIND XML FOR %s" % url)
            return
            #return []

        url = url.replace('.xml', '.html')
        metadict = self.getMetaDict(xml, printit=False)
        if len(metadict) == 0:
            log.warn(
                "NO METADATA FOR %s. SKIPPING ARTICLE (to be retrieved after officiele bekendmakingen finalizes it)"
                % url)
            return
            #return []

        section = self.safeMetaGet(metadict, 'OVERHEID.category')

        document_id = metadict['DC.identifier'].strip()
        if document_id.count('-') == 1:
            #kamer = 'NA'
            # NOTE(review): `kamer` stays unbound (NameError) when neither
            # chamber matches; the commented default above was never enabled.
            if 'tweede' in metadict['DC.creator'].lower(): kamer = 'tk'
            if 'eerste' in metadict['DC.creator'].lower(): kamer = 'ek'
            document_id = document_id.replace('-', '-%s-' % kamer)
        print('document id:', document_id)

        author = self.safeMetaGet(metadict, 'OVERHEIDop.indiener')
        typevraag = metadict['DC.type']

        body = self.getBody(xml)
        # NOTE(review): literal "document_id" -- likely meant the variable.
        # Also `headline` is never used below (the yield passes document_id),
        # so the " (publicatiedatum)" marker added further down is lost.
        headline = "document_id (%s)" % author

        try:
            datestring = adhocDatefix(metadict['OVERHEIDop.datumOntvangst'])
        except:
            datestring = adhocDatefix(metadict['OVERHEIDop.datumIndiening'])
            headline += " (publicatiedatum)"

        # Dates appear both as yyyy-mm-dd and dd-mm-yyyy.
        try:
            date = datetime.datetime.strptime(datestring, '%Y-%m-%d')
        except:
            date = datetime.datetime.strptime(datestring, '%d-%m-%Y')

        #print('--------------\n', document_id, typevraag, '\n', body, '\n\n')
        print("SAVING: %s" % url)

        yield Article(headline=document_id,
                      byline=typevraag,
                      text=body,
                      date=date,
                      section=section,
                      url=url)
 def scrape_3(self, _html):
     """Some ugly MS Word format, as of 2014-03-03"""
     # Split the page into chunks, one per <hr> separator.
     chunks = []
     current = []
     for tag in _html.cssselect("body > div > *"):
         if tag.cssselect("hr"):
             chunks.append(current)
             current = []
         else:
             current.append(tag)
     # The chunk before the first <hr> is skipped.
     for tags in chunks[1:]:
         article = self.parse_dateline(tags[1].text_content().strip(), Article())
         article.headline = tags[1].text_content().strip()
         html_str = "".join(html.tostring(t) for t in tags[2:])
         article.text = html2text(html_str)
         article.metastring = {'html': html_str}
         yield article
Esempio n. 33
0
    def scrape_1(self, _html, t):
        """format of mostly 2013"""
        if "werkmap" in t:
            divs = _html.cssselect("#articleTable div")
        elif "intranet/rss" in t:
            divs = [d for d in _html.cssselect("#sort div") if "sort_" in d.get('id')]
        else:
            raise ValueError("Neither 'werkmap' nor 'intranet/rss' in html.")

        for div in divs:
            article = Article(metastring=div.text_content())
            article.headline = div.cssselect("#articleTitle")[0].text_content()
            article.text = div.cssselect("#articleIntro")[0].text_content()

            page_tags = div.cssselect("#articlePage")
            if page_tags:
                article.pagenr, article.section = self.get_pagenum(page_tags[0].text_content())

            article.medium = self.get_medium(div.cssselect("#sourceTitle")[0].text_content())

            date_str = div.cssselect("#articleDate")[0].text_content()
            try:
                article.date = readDate(date_str)
            except ValueError:
                # Unparseable date: log and skip this article.
                log.error("parsing date \"{date_str}\" failed".format(**locals()))
                continue
            yield article
Esempio n. 34
0
def get_article(e):
    """Build an Article from element *e* via the module-level extractors."""
    headline = get_headline(e)
    body = get_body(e)
    medium_name, date, page = get_meta(e)
    section = get_section(e)
    medium = get_or_create(Medium, name=medium_name)

    return Article(headline=headline, text=body, date=date,
                   pagenr=page, section=section, medium=medium)
Esempio n. 35
0
 def parse_file(self, file, encoding, data):
     """Yield one Article per scraped item, mapped through the field_map option.

     @param data: a (ln_query, articles) tuple; the query is stored on self
         and each item in *articles* is a dict of scraped values.
     """
     self.ln_query, arts = data
     # Renamed the loop variable (was `data`), which shadowed the parameter.
     for row in arts:
         art = {}
         for field, setting in self.options['field_map'].items():
             datatype = get_property_primitive_type(field)
             value, typ = setting['value'], setting['type']
             # 'field' settings read from the scraped row; anything else is a literal.
             val = row.get(value) if typ == 'field' else value
             if val:
                 # isinstance (not `type(...) is str`) also accepts str subclasses.
                 if datatype is datetime.datetime and isinstance(val, str):
                     val = toolkit.read_date(val)
                 art[field] = val
         yield Article(**art)
Esempio n. 36
0
def create_test_article(create=True,
                        articleset=None,
                        check_duplicate=False,
                        **kargs):
    """Create a test article, filling in defaults for any missing fields.

    @param create: when True, persist the article via Article.create_articles
    @param articleset: optional set to add the created article to
    @param check_duplicate: passed through to Article.create_articles
    """
    from amcat.models.article import Article

    # Callers may pass the date as a string. `basestring` is Python 2 only
    # and raises NameError on Python 3; test against `str` instead.
    if "date" in kargs and isinstance(kargs["date"], str):
        kargs["date"] = read_date(kargs["date"])

    if "project" not in kargs: kargs["project"] = create_test_project()
    if "date" not in kargs: kargs["date"] = datetime.date(2000, 1, 1)
    if "medium" not in kargs: kargs["medium"] = create_test_medium()
    if "id" not in kargs: kargs["id"] = _get_next_id()
    if 'headline' not in kargs: kargs['headline'] = 'test headline'
    if 'text' not in kargs: kargs["text"] = "\n\n".join(map(str, range(5)))

    a = Article(**kargs)
    if create:
        Article.create_articles([a],
                                articleset,
                                check_duplicate=check_duplicate,
                                create_id=True)
    return a
Esempio n. 37
0
def get_article(e):
    """Build an Article from element *e*; optional fields become properties."""
    title = get_title(e)
    body = get_body(e)
    medium, date, page = get_meta(e)
    section = get_section(e)

    art = Article(title=title, text=body, date=date)

    # Attach only the optional fields that were actually found.
    for prop, value in (("page_num", page),
                        ("section", section),
                        ("medium", medium)):
        if value is not None:
            art.set_property(prop, value)

    return art
Esempio n. 38
0
    def add_articles(self,
                     articles,
                     add_to_index=True,
                     monitor=None):
        """
        Add the given articles to this articleset. Implementation consists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        @param articles: articles to be added
        @type articles: iterable with indexing of integers or Article objects

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool

        @param monitor: progress monitor; a fresh ProgressMonitor is created
            when omitted. (Previously a shared mutable default instance was
            used, which leaked state between calls.)
        """
        if monitor is None:
            monitor = ProgressMonitor()

        # Normalize to a set of article ids, accepting Article objects too.
        articles = {(art if type(art) is int else art.id) for art in articles}
        to_add = articles - self.get_article_ids()
        # Only use articles that exist
        to_add = list(Article.exists(to_add))

        monitor.update(10,
                       "{n} articles need to be added".format(n=len(to_add)))
        ArticleSetArticle.objects.bulk_create(
            [
                ArticleSetArticle(articleset=self, article_id=artid)
                for artid in to_add
            ],
            batch_size=100,
        )

        monitor.update(
            20,
            "{n} articleset articles added to database, adding to codingjobs".
            format(n=len(to_add)))
        # Every codingjob on this set gets a CodedArticle per new article.
        cjarts = [
            CodedArticle(codingjob=c, article_id=a)
            for c, a in itertools.product(self.codingjob_set.all(), to_add)
        ]
        CodedArticle.objects.bulk_create(cjarts)

        monitor.update(
            30, "{n} articles added to codingjobs, adding to index".format(
                n=len(cjarts)))
        if add_to_index:
            amcates.ES().add_to_set(self.id, to_add, monitor=monitor)
Esempio n. 39
0
    def _scrape_unit(self, ftuple):
        """Yield one Article built from a (title, url, body) tuple."""
        title, url, body = ftuple

        date = getDate(url)
        medium = title.lower()
        med = Medium.get_or_create(medium)

        # Headline: "<medium> (<last url segment without .stl>)"
        stem = url.split('/')[-1].replace('.stl', '').strip()
        yield Article(headline="%s (%s)" % (medium, stem),
                      text=body,
                      medium=med,
                      date=date,
                      url=url)
Esempio n. 40
0
    def add_articles(self, article_ids, add_to_index=True, monitor=None):
        """
        Add the given articles to this articleset. Implementation consists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        @param article_ids: articles to be added
        @type article_ids: iterable with indexing of integers or Article objects

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool

        @param monitor: progress monitor; a fresh NullMonitor is created when
            omitted. (Previously a shared mutable default instance was used,
            which leaked submonitor state between calls.)
        """
        if monitor is None:
            monitor = NullMonitor()
        monitor = monitor.submonitor(total=4)

        # Normalize to a set of article ids, accepting Article objects too.
        article_ids = {(art if type(art) is int else art.id) for art in article_ids}

        # Only use articles that exist
        to_add = article_ids - self.get_article_ids()
        to_add = list(Article.exists(to_add))

        monitor.update(message="Adding {n} articles to {aset}..".format(n=len(to_add), aset=self))
        ArticleSetArticle.objects.bulk_create(
            [ArticleSetArticle(articleset=self, article_id=artid) for artid in to_add],
            batch_size=100,
        )

        monitor.update(message="{n} articleset articles added to database, adding to codingjobs..".format(n=len(to_add)))
        # Every codingjob on this set gets a CodedArticle per new article.
        cjarts = [CodedArticle(codingjob=c, article_id=a) for c, a in itertools.product(self.codingjob_set.all(), to_add)]
        CodedArticle.objects.bulk_create(cjarts)

        if add_to_index:
            monitor.update(message="{n} articles added to codingjobs, adding to index".format(n=len(cjarts)))
            es = ES()
            es.add_to_set(self.id, to_add, monitor=monitor)
            es.refresh()  # We need to flush, or setting cache will fail
            # Add to property cache
            properties = ES().get_used_properties(article_ids=to_add)
            self._add_to_property_cache(properties)
        else:
            monitor.update(2)
Esempio n. 41
0
    def parse_document(self, row):
        """Map one CSV row onto an Article.

        Columns mapped via the options are parsed with PARSERS where
        available; all unmapped columns are serialized as JSON into the
        metastring field.
        """
        kargs = dict(medium=self._medium, metastring={})
        mapping = [(name, self.options[name])
                   for name in FIELDS if self.options[name]]
        for name, column in mapping:
            val = row[column]
            if name == 'date' and isinstance(val, datetime.datetime):
                pass  # already a datetime, nothing to parse
            elif val.strip():
                if name in PARSERS:
                    val = PARSERS[name](val)
            elif is_nullable(name):
                val = None
            else:
                val = val.strip()

            kargs[name] = val

        # Everything not explicitly mapped goes into the metastring blob.
        mapped_columns = [column for _, column in mapping]
        for key, value in row.items():
            if key not in mapped_columns:
                kargs["metastring"][key] = value

        kargs["metastring"] = json.dumps(kargs["metastring"])

        # In case medium wasn't defined in csv
        if self._medium is not None:
            kargs["medium"] = self._medium

        if self.parent_field:
            doc_id = kargs.get(self.id_field)
            parent_id = kargs.pop(self.parent_field)
            if parent_id:
                self.parents[doc_id] = parent_id

        article = Article(**kargs)
        if self.parent_field:
            self.articles[doc_id] = article

        return article
Esempio n. 42
0
 def _scrape_unit(self, _file):
     """unit: a pdf document

     Extracts all text lines per page, joins them with newlines (pages
     separated by a blank line), and yields a single Article.
     """
     parser = PDFParser()
     doc = parser.load_document(_file, self.options['pdf_password'])
     pages = []
     for page in parser.process_document(doc):
         page_lines = [ln.get_text() for ln in parser.get_textlines(page)]
         pages.append("".join(s + "\n" for s in page_lines))
     full_text = "".join(p + "\n\n" for p in pages)

     article = Article(text=full_text)
     article.headline = self.getheadline(_file)
     article.medium = self.options['medium']
     article.section = self.options['section']
     # Fall back to today's date when no explicit date was supplied.
     article.date = self.options['date'] if self.options['date'] else date.today()
     yield article
Esempio n. 43
0
    def parse_document(self, tupleText):
        """Parse one (meta, body) text tuple into an Article.

        The metadata block carries id/headline on its first line and
        "medium, date, page, length" on its third; the body holds the
        section on line 3 and the article text from line 6 on.
        """
        meta, body = tupleText
        meta_lines = meta.strip().split('\n')

        first_line = meta_lines[0]
        externalid = int(first_line.split('.')[0].lstrip('?'))
        headline = first_line.partition('. ')[2]

        medium_name, date, pagenr, length = meta_lines[2].split(', ')

        body_lines = body.split('\n')

        return Article(
            externalid=externalid,
            headline=headline,
            medium=get_or_create_medium(medium_name),
            date=readDate(date),
            pagenr=int(pagenr.strip('p.')),
            length=int(length.strip('w.')),
            section=body_lines[2],
            text='\n'.join(body_lines[5:]),
            project=self.options['project'],
        )
Esempio n. 44
0
    def parse_file(self, file):
        """Build a single Article from *file* according to the field_map option."""
        dirname, filename = os.path.split(file.name)
        filename, ext = os.path.splitext(filename)

        def resolve(type, value):
            # 'literal' settings are used verbatim; others name a value source.
            if type == 'literal':
                return value
            if value == 'filename':
                return filename
            if value == 'text':
                return file.read()
            if value.startswith('filename-'):
                # 'filename-n' selects the n-th underscore-separated part (1-based)
                index = int(value.split("-")[-1])
                return filename.split("_")[index - 1]
            raise ValueError("Can't parse field {value}".format(**locals()))

        fields = {}
        for field, setting in self.options['field_map'].items():
            fields[field] = resolve(**setting)
        return [Article(**fields)]
Esempio n. 45
0
    def parse_file(self, file, encoding, _data):
        """Build a single Article from *file* according to the field_map option."""
        path, filename = os.path.split(file)
        filename, ext = os.path.splitext(filename)

        def resolve(type, value):
            # 'literal' settings are used verbatim; others name a value source.
            if type == 'literal':
                return value
            if value == 'Filename':
                return filename
            if value == 'Text':
                return _read(file, encoding)
            if value == 'Path':
                return path
            if value.startswith('Filename part '):
                # 'Filename part n' selects the n-th underscore-separated part (1-based)
                index = int(value.replace("Filename part ", ""))
                return filename.split("_")[index - 1]
            raise ValueError("Can't parse field {value}".format(**locals()))

        fields = {field: resolve(**setting)
                  for field, setting in self.options['field_map'].items()}
        return [Article(**fields)]
Esempio n. 46
0
 def _scrape_unit(self, _file):
     """unit: a pdf document

     Collects every text line of every page (lines newline-terminated,
     pages separated by a blank line) and yields one Article.
     """
     parser = PDFParser()
     doc = parser.load_document(_file, self.options['pdf_password'])
     chunks = []
     for page in parser.process_document(doc):
         for textline in parser.get_textlines(page):
             chunks.append(textline.get_text())
             chunks.append("\n")
         chunks.append("\n\n")

     article = Article(text="".join(chunks))
     article.headline = self.getheadline(_file)
     article.medium = self.options['medium']
     article.section = self.options['section']
     # Fall back to today's date when no explicit date was supplied.
     article.date = self.options['date'] if self.options['date'] else date.today()
     yield article
Esempio n. 47
0
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail digest file and yield a single Article.

        The file has a mail header, then (from the "1red" marker on) a
        metadata line, an upper-case headline, and '='-wrapped paragraphs
        containing quoted-printable style =XX escapes.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = read_date(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  #actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  #loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():
                article.title = line
                break
            elif line:  #first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = read_date(datestr)
                    if (article.date - file_date).days > 200:
                        # Likely a misparse: the mail was sent the previous
                        # year. timedelta() has no `years` keyword (it raised
                        # TypeError here); subtract 365 days instead.
                        article.date -= timedelta(days=365)
                else:
                    article.date = read_date(datestr)
                # NOTE(review): membership tests data[2] but the alias lookup
                # uses data[1] -- confirm which index BZK_ALIASES keys on.
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[1]]
                else:
                    medium_str = data[2]
                article.set_property("medium", medium_str)
                article.set_property("section", data[1])

        paragraphs = []
        paragraph = ""
        while True:
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  #subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(p.split(",")) > 1:  # last line of normal content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            try:
                # Two hex digits name a latin-1 byte value; chr() maps it
                # directly (replaces the Python-2-only str.decode chain).
                return chr(int(code, 16))
            except ValueError:
                # The regex also matches non-hex pairs (e.g. "=ZZ"); keep them.
                return match.group()

        article.text = re.sub(
            "=[A-Z0-9]{2}",
            character,
            article.text)

        yield article
Esempio n. 48
0
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail digest file and yield a single Article.

        The file has a mail header, then (from the "1red" marker on) a
        metadata line, an upper-case headline, and '='-wrapped paragraphs
        containing quoted-printable style =XX escapes.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = read_date(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"):  #actual content starts
                lines.append("")

        article = Article(metastring={'mail_header': "".join(mail_header)})

        while True:  #loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper():
                article.title = line
                break
            elif line:  #first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr:  # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = read_date(datestr)
                    if (article.date - file_date).days > 200:
                        # Likely a misparse: the mail was sent the previous
                        # year. timedelta() has no `years` keyword (it raised
                        # TypeError here); subtract 365 days instead.
                        article.date -= timedelta(days=365)
                else:
                    article.date = read_date(datestr)
                # NOTE(review): membership tests data[2] but the alias lookup
                # uses data[1] -- confirm which index BZK_ALIASES keys on.
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[1]]
                else:
                    medium_str = data[2]
                article.set_property("medium", medium_str)
                article.set_property("section", data[1])

        paragraphs = []
        paragraph = ""
        while True:
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper():  #subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(
                    p.split(",")) > 1:  # last line of normal content
                break

        # Add non-ascii characters
        # Takes the '=AB' occurrences and turns them into latin-1 characters.
        def character(match):
            code = match.group()[1:]
            if code == "92":
                return "'"
            elif code == "85":
                return "..."
            try:
                # Two hex digits name a latin-1 byte value; chr() maps it
                # directly (replaces the Python-2-only str.decode chain).
                return chr(int(code, 16))
            except ValueError:
                # The regex also matches non-hex pairs (e.g. "=ZZ"); keep them.
                return match.group()

        article.text = re.sub("=[A-Z0-9]{2}", character, article.text)

        yield article
    def _scrape_unit(self, _file):
        """Parse one BZK e-mail digest file and yield a single Article.

        The file has a mail header, then (from the "1red" marker on) a
        metadata line, an upper-case headline and '='-wrapped paragraphs.
        """
        readlines = _file.readlines()
        file_date_line = [l for l in readlines if l.startswith("Date:")][0]
        file_date = readDate(file_date_line.split("Date:")[1])

        lines = []
        mail_header = []
        for line in readlines:
            if lines:
                lines.append(line.rstrip("\r\n"))
            else:
                mail_header.append(line)
            if line.startswith("1red"): #actual content starts
                lines.append("")

        article = Article(metastring = {'mail_header': "".join(mail_header)})

        while True: #loop through lines up to and including headline
            line = lines.pop(0)
            if line.isupper(): #headline
                article.headline = line
                break
            elif line: #first non-empty line, contains metadata
                data = line.split(", ")
                datestr = data[0]
                if "'" in datestr:
                    split = datestr.split("'")
                    datestr = split[0] + "20" + split[1]
                if "=" in datestr: # if this is true, the year is not parsable
                    # we take the year the mail was sent, might fail around december
                    datestr = datestr.split("=")[0] + str(file_date.year)
                    article.date = readDate(datestr)
                    if (article.date - file_date).days > 200: #likely a misparse, with the mail being sent the next year
                        # timedelta() has no `years` keyword (it raised
                        # TypeError here); subtract 365 days instead.
                        article.date -= timedelta(days=365)
                else:
                    article.date = readDate(datestr)
                # NOTE(review): membership tests data[2] but the alias lookup
                # uses data[1] -- confirm which index BZK_ALIASES keys on.
                if data[2] in BZK_ALIASES:
                    medium_str = BZK_ALIASES[data[1]]
                else:
                    medium_str = data[2]
                article.medium = Medium.get_or_create(medium_str)
                article.section = data[1]

        paragraphs = []
        paragraph = ""
        while True:
            line = lines.pop(0).rstrip("=")
            if not line:
                paragraphs.append(paragraph)
                paragraph = ""
            elif line.isupper(): #subheader
                paragraph += line + "\n"
            else:
                paragraph += line
            if not lines:
                break
        paragraphs = [p for p in paragraphs if p]

        article.text = ""
        for p in paragraphs:
            article.text += p + "\n\n"
            if p.startswith("(") and len(p.split(",")) > 1: # last line of normal content
                break

        yield article
Esempio n. 50
0
 def parse_file(self, file, _):
     """Read *file* and yield an Article for every scraped unit in it."""
     contents = file.read()
     for art_dict in self._scrape_unit(contents):
         mapped = self.map_article(art_dict, DEFAULTS)
         yield Article.fromdict(mapped)