Beispiel #1
0
def article_to_json(article: Article):
    """Serialize an article into a JSON-compatible dict.

    Includes all static fields except id/project bookkeeping, plus a
    "properties" entry with the article's dynamic properties.
    """
    excluded = {"id", "project_id", "project", "properties"}
    result = {name: getattr(article, name)
              for name in article.static_fields() - excluded}
    result["properties"] = dict(article.get_properties().items())
    return result
Beispiel #2
0
    def test_list_media(self):
        """Test that list media works for more than 10 media"""
        from amcat.models import Article
        media = [amcattest.create_test_medium() for _ in range(20)]
        arts = [amcattest.create_test_article(medium=medium, create=False)
                for medium in media]

        first_set = amcattest.create_test_set()
        Article.create_articles(arts[:5], articleset=first_set,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(media[:5]), set(first_set.get_mediums()))

        second_set = amcattest.create_test_set(project=first_set.project)
        Article.create_articles(arts[5:], articleset=second_set,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(media[5:]), set(second_set.get_mediums()))

        # The shared project should report the union of both sets' media
        self.assertEqual(set(media), set(first_set.project.get_mediums()))
Beispiel #3
0
    def setup(self):
        """Create two articlesets and five articles; a-d are indexed into
        the first set, e is created directly inside the second."""
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        def unsaved(**kwargs):
            # All of a-d are built unsaved and bulk-created below
            return amcattest.create_test_article(create=False, **kwargs)

        a = unsaved(text='aap noot mies', title='m1', date='2001-01-01')
        b = unsaved(text='noot mies wim zus', title='m2', date='2001-02-01')
        c = unsaved(text='mies bla bla bla wim zus jet', title='m2',
                    date='2002-01-01')
        d = unsaved(text='noot mies wim zus', title='m2', date='2001-02-03')
        e = amcattest.create_test_article(text='aap noot mies', title='m3',
                                          articleset=s2)

        Article.create_articles([a, b, c, d], articleset=s1)
        ES().refresh()
        return s1, s2, a, b, c, d, e
Beispiel #4
0
    def test_aggregate(self):
        """Can we make tables per medium/date interval?"""
        from amcat.models import Article
        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        # An article in another set, to verify set filtering
        unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

        def unsaved(**kw):
            return amcattest.create_test_article(create=False, **kw)

        a = unsaved(text='aap noot mies', medium=m1, date='2001-01-01')
        b = unsaved(text='noot mies wim zus', medium=m2, date='2001-02-01')
        c = unsaved(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01')
        d = unsaved(text='noot mies wim zus', medium=m2, date='2001-02-03')

        Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
        ES().flush()

        def aggregate(**kw):
            # Aggregate over s1 only
            return dict(ES().aggregate_query(filters=dict(sets=s1.id), **kw))

        self.assertEqual(aggregate(group_by="mediumid"),
                         {m1.id: 1, m2.id: 3})
        self.assertEqual(aggregate(group_by="date", date_interval="year"),
                         {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1})
        self.assertEqual(aggregate(group_by="date", date_interval="month"),
                         {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1,
                          datetime(2001, 2, 1): 2})

        # set statistics
        stats = ES().statistics(filters=dict(sets=s1.id))
        self.assertEqual(stats.n, 4)
        self.assertEqual(stats.start_date, datetime(2001, 1, 1))
        self.assertEqual(stats.end_date, datetime(2002, 1, 1))

        # media list
        self.assertEqual({m1.id, m2.id},
                         set(ES().list_media(filters=dict(sets=s1.id))))
Beispiel #5
0
    def test_get_articles(self):
        """Test splitting an article into parts via get_articles()."""
        from amcat.models import Sentence

        # def instead of lambda assignment (PEP 8 E731)
        def _get_articles(a, s):
            return list(get_articles(a, s))

        # Should raise exception if sentences not in article
        article, sentences = self.create_test_sentences()
        s1 = Sentence.objects.filter(id=amcattest.create_test_sentence().id)
        self.assertRaises(ValueError, _get_articles, article, s1)

        # Should raise an exception if we try to split on headline
        self.assertRaises(ValueError, _get_articles, article, sentences.filter(parnr=1))

        # Should return a "copy", with byline in "text" property
        arts = _get_articles(article, Sentence.objects.none())
        Article.create_articles(arts)

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual throughout
        self.assertEqual(len(arts), 1)
        sbd.create_sentences(arts[0])

        self.assertEqual(
            [s.sentence for s in sentences[1:]],
            [s.sentence for s in arts[0].sentences.all()[1:]]
        )

        self.assertTrue("foo" in arts[0].text)

        # Should be able to split on byline
        self.assertEqual(2, len(_get_articles(article, sentences[1:2])))
        a, b = _get_articles(article, sentences[4:5])

        # Check if text on splitted articles contains expected
        self.assertTrue("Einde" not in a.text)
        self.assertTrue("Einde" in b.text)
Beispiel #6
0
    def _run(self, local_project, remote_host, remote_token, remote_project_id,
             remote_articleset_id):
        """Copy a remote articleset into local_project via the remote API.

        Creates a local ArticleSet mirroring the remote one, copies every
        page of articles, and rewrites parent_hash references so they point
        at the locally recomputed hashes. Returns the new set's id; on
        APIError delegates to self.handleError.
        """
        try:
            page_size = 1000
            query = RemoteQuery(remote_host,
                                remote_token,
                                remote_project_id,
                                remote_articleset_id,
                                page_size=page_size)
            # BUG FIX: renamed 'set' -> 'articleset' (was shadowing the builtin)
            set_fields = {
                k: v
                for k, v in query.get_articleset().items()
                if k in COPY_SET_FIELDS
            }
            set_fields.update(project=local_project)
            articleset = ArticleSet.objects.create(**set_fields)
            for page in query:
                articles_hashes = [(self.create_article(x, local_project),
                                    x["hash"]) for x in page]
                # Map the remote (old) hash of each article to its new local hash
                hashmap = {
                    old_hash: article.hash
                    for article, old_hash in articles_hashes
                }
                # List comprehension instead of zip(*...): also safe on an
                # empty page, where zip(*[]) would raise ValueError
                articles = [article for article, _ in articles_hashes]
                for article in articles:
                    if article.parent_hash in hashmap:
                        article.parent_hash = hashmap[article.parent_hash]

                Article.create_articles(articles, articleset=articleset)
            return articleset.id
        except APIError as e:
            self.handleError(e)
Beispiel #7
0
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        fields = {
            "date": datetime.date(2015, 1, 1),
            "section": "\u6f22\u5b57",
            "pagenr": 1928390,
            "headline": "Headline hier.",
            "byline": "byline..",
            "length": 1928,
            "metastring": "Even more strange characters.. \x0C ..",
            "url": "https://example.com",
            "externalid": None,
            "author": None,
            "addressee": "Hmm",
            "text": "Contains invalid char \x08 woo",
            "medium": create_test_medium(name="abc."),
            "project": create_test_project(),
        }
        article = Article(**fields)
        article.save()

        es = ES()
        es.add_articles([article.id])
        expected = get_article_dict(article)["hash"]
        es.flush()

        # Fetch only the hash ingredients plus the stored hash itself
        results = list(es.query_all(filters={"ids": [article.id]},
                                    fields=HASH_FIELDS + ["hash"]))
        es_article = results[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(expected, es_article.hash)
        self.assertEqual(expected, _get_hash(es_article.to_dict()))
Beispiel #8
0
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        art = Article(
            date=datetime.date(2015, 1, 1),
            section="\u6f22\u5b57",
            pagenr=1928390,
            headline="Headline hier.",
            byline="byline..",
            length=1928,
            metastring="Even more strange characters.. \x0C ..",
            url="https://example.com",
            externalid=None,
            author=None,
            addressee="Hmm",
            text="Contains invalid char \x08 woo",
            medium=create_test_medium(name="abc."),
            project=create_test_project(),
        )
        art.save()

        es = ES()
        es.add_articles([art.id])
        stored_hash = get_article_dict(art)["hash"]
        es.flush()

        # Round-trip: the hash recomputed from elastic data must match
        hits = es.query_all(filters={"ids": [art.id]},
                            fields=HASH_FIELDS + ["hash"])
        from_es = next(iter(hits))

        self.assertEqual(art.id, from_es.id)
        self.assertEqual(stored_hash, from_es.hash)
        self.assertEqual(stored_hash, _get_hash(from_es.to_dict()))
Beispiel #9
0
    def create(self, validated_data):
        """Build an (unsaved) Article from validated_data.

        Returns a pair (article, children) where children is a lazy iterator
        over the recursively created child articles.
        """
        child_data = validated_data.pop("children")
        article = Article(**validated_data)

        # Derive length from the text when not explicitly supplied
        if article.length is None:
            article.length = word_len(article.text)

        return (article, map(self.create, child_data))
Beispiel #10
0
    def create(self, validated_data):
        """Instantiate an Article from validated_data and recurse into its
        children; the children iterator is consumed lazily by the caller."""
        nested = validated_data.pop("children")
        article = Article(**validated_data)

        # Compute a word count when no explicit length was given
        if article.length is None:
            article.length = word_len(article.text)

        return article, map(self.create, nested)
Beispiel #11
0
 def scrape_unit(self, unit):
     """Convert one scraped unit (a dict of Dutch-keyed fields) into an
     unsaved Article with author/publisher properties."""
     date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
     # Publisher = the last two labels of the url's hostname
     hostname = urlparse(unit["url"]).hostname
     publisher = ".".join(hostname.split(".")[-2:])
     # Placeholder when the title is empty or whitespace-only
     title = unit["titel"].strip() or "[No title]"
     article = Article(title=title, text=unit["bericht tekst"],
                       url=unit["url"], date=date)
     article.set_property("author", unit["auteur"])
     article.set_property("publisher", publisher)
     return article
Beispiel #12
0
    def test_get_ids(self):
        """get_ids() should yield the root id and every descendant id."""
        def leaf(aid):
            return ArticleTree(Article(id=aid), [])

        tree = ArticleTree(Article(id=3), [
            leaf(5),
            ArticleTree(Article(id=6), [leaf(7)]),
        ])

        self.assertEqual({3, 5, 6, 7}, set(tree.get_ids()))
Beispiel #13
0
 def test_create(self):
     """Can we create/store/index an article object?"""
     art = amcattest.create_test_article(create=False, date='2010-12-31',
                                         headline=u'\ua000abcd\u07b4')
     Article.create_articles([art], create_id=True)
     from_db = Article.objects.get(pk=art.id)
     amcates.ES().flush()
     from_es = list(amcates.ES().query(filters={'ids': [art.id]},
                                       fields=["date", "headline"]))[0]
     # Headline and date must round-trip through db and elastic alike
     self.assertEqual(art.headline, from_db.headline)
     self.assertEqual(art.headline, from_es.headline)
     self.assertEqual('2010-12-31T00:00:00', from_db.date.isoformat())
     self.assertEqual('2010-12-31T00:00:00', from_es.date.isoformat())
Beispiel #14
0
    def _parse_comment(self, comment, base_title, base_url):
        """Parse one comment element into an unsaved Article."""
        text = html2text(comment.cssselect("p"))
        article_id = comment.get("id")
        # Title and url are the parent's, disambiguated by the comment id
        title = "{base_title}#{article_id}".format(base_title=base_title,
                                                   article_id=article_id)
        url = "{base_url}#{article_id}".format(base_url=base_url,
                                               article_id=article_id)
        footer_text = comment.cssselect("footer")[0].text_content()
        author, timestamp = _parse_comment_footer(footer_text)

        # Use "." when the comment body is empty/whitespace-only
        article = Article(date=timestamp, title=title,
                          text=text.strip() or ".", url=url)
        article.set_property("author", author.strip())
        article.set_property("medium", "GeenStijl Comments")
        return article
Beispiel #15
0
    def setup(self):
        """Build two articlesets and five articles; a-d are bulk-created into
        s1, e is created directly in s2."""
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        rows = [('aap noot mies', 'm1', '2001-01-01'),
                ('noot mies wim zus', 'm2', '2001-02-01'),
                ('mies bla bla bla wim zus jet', 'm2', '2002-01-01'),
                ('noot mies wim zus', 'm2', '2001-02-03')]
        a, b, c, d = [amcattest.create_test_article(text=text, title=title,
                                                    date=date, create=False)
                      for text, title, date in rows]
        e = amcattest.create_test_article(text='aap noot mies', title='m3',
                                          articleset=s2)

        Article.create_articles([a, b, c, d], articleset=s1)
        ES().refresh()
        return s1, s2, a, b, c, d, e
Beispiel #16
0
 def test_create_order(self):
     """Is insert order preserved in id order?"""
     articles = [amcattest.create_test_article(create=False) for _i in range(25)]
     random.shuffle(articles)
     Article.create_articles(articles)
     # Ids must be assigned monotonically in insertion order
     assigned = [article.id for article in articles]
     self.assertEqual(assigned, sorted(assigned))
     # Each stored article must keep its own title
     for article in articles:
         stored = Article.objects.get(pk=article.id)
         self.assertEqual(stored.title, article.title)
Beispiel #17
0
 def test_create_order(self):
     """Is insert order preserved in id order?"""
     articles = [amcattest.create_test_article(create=False) for _i in range(25)]
     random.shuffle(articles)
     Article.create_articles(articles)
     ids = [article.id for article in articles]
     # Ids should come back in the same order they were inserted
     self.assertEqual(ids, sorted(ids))
     # Titles must survive the round-trip to the database
     for original in articles:
         fetched = Article.objects.get(pk=original.id)
         self.assertEqual(fetched.title, original.title)
Beispiel #18
0
    def create(self, validated_data):
        """Create (or link) an article within the target articleset.

        When an 'id' is posted, the existing article is linked into the set
        (after a read-access check); otherwise a new article is created
        from the posted JSON.
        """
        articleset = self.get_articleset()

        if 'id' in validated_data:
            existing_id = validated_data['id']
            # Linking an existing article requires read access to it
            _check_read_access(self.context['request'].user, [existing_id])
            article = Article.objects.get(pk=existing_id)
            articleset.add_articles([article])
        else:
            article = json_to_article(validated_data, articleset.project)
            Article.create_articles([article], articleset=articleset)

        return article
Beispiel #19
0
    def test_query_all(self):
        """Test that query_all works"""
        from amcat.models import Article
        articles = [amcattest.create_test_article(create=False) for _ in range(20)]
        aset = amcattest.create_test_set()
        Article.create_articles(articles, articleset=aset, check_duplicate=False)
        ES().flush()

        # A plain query is capped by size...
        limited = list(ES().query(filters=dict(sets=aset.id), size=10))
        self.assertEqual(10, len(limited))

        # ...while query_all pages through every result
        everything = list(ES().query_all(filters=dict(sets=aset.id), size=10))
        self.assertEqual(len(articles), len(everything))
Beispiel #20
0
 def test_create(self):
     """Can we create/store/index an article object?"""
     art = amcattest.create_test_article(create=False, date='2010-12-31',
                                         title=u'\ua000abcd\u07b4')
     Article.create_articles([art])
     db_art = Article.objects.get(pk=art.id)
     amcates.ES().refresh()
     es_art = list(amcates.ES().query(filters={'ids': [art.id]},
                                      _source=["date", "title", "hash"]))[0]
     # Hash, title and date should agree across db and elastic
     for stored in (db_art, es_art):
         self.assertEqual(art.hash, stored.hash)
         self.assertEqual(art.title, stored.title)
         self.assertEqual('2010-12-31T00:00:00', stored.date.isoformat())
Beispiel #21
0
    def create(self, validated_data):
        """Add an article to the target set: link the existing article when
        an id is posted, otherwise create one from the posted JSON."""
        articleset = self.get_articleset()

        if 'id' not in validated_data:
            article = json_to_article(validated_data, articleset.project)
            Article.create_articles([article], articleset=articleset)
        else:
            # Linking an existing article requires read access to it
            _check_read_access(self.context['request'].user,
                               [validated_data['id']])
            article = Article.objects.get(pk=validated_data['id'])
            articleset.add_articles([article])

        return article
Beispiel #22
0
    def test_query_all(self):
        """Test that query_all works"""
        from amcat.models import Article
        articles = [amcattest.create_test_article(create=False) for _ in range(20)]
        articleset = amcattest.create_test_set()
        Article.create_articles(articles, articleset=articleset,
                                check_duplicate=False, create_id=True)
        ES().flush()

        # query() honours the size limit...
        page = ES().query(filters=dict(sets=articleset.id), size=10)
        self.assertEqual(10, len(list(page)))

        # ...query_all() transparently fetches all pages
        full = ES().query_all(filters=dict(sets=articleset.id), size=10)
        self.assertEqual(len(articles), len(list(full)))
Beispiel #23
0
    def setup(self):
        """Create three media, two sets and five articles; a-d go into s1,
        e is created directly in s2."""
        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        def unsaved(**kw):
            return amcattest.create_test_article(create=False, **kw)

        a = unsaved(text='aap noot mies', medium=m1, date='2001-01-01')
        b = unsaved(text='noot mies wim zus', medium=m2, date='2001-02-01')
        c = unsaved(text='mies bla bla bla wim zus jet', medium=m2,
                    date='2002-01-01')
        d = unsaved(text='noot mies wim zus', medium=m2, date='2001-02-03')
        e = amcattest.create_test_article(text='aap noot mies', medium=m3,
                                          articleset=s2)

        Article.create_articles([a, b, c, d], articleset=s1)
        ES().flush()
        return m1, m2, m3, s1, s2, a, b, c, d, e
Beispiel #24
0
    def setup(self):
        """Fixture: three media, two sets, five articles. a-d are indexed
        into s1 with forced ids; e is created directly in s2."""
        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        rows = [('aap noot mies', m1, '2001-01-01'),
                ('noot mies wim zus', m2, '2001-02-01'),
                ('mies bla bla bla wim zus jet', m2, '2002-01-01'),
                ('noot mies wim zus', m2, '2001-02-03')]
        a, b, c, d = [amcattest.create_test_article(text=text, medium=medium,
                                                    date=date, create=False)
                      for text, medium, date in rows]
        e = amcattest.create_test_article(text='aap noot mies', medium=m3,
                                          articleset=s2)

        Article.create_articles([a, b, c, d], articleset=s1,
                                check_duplicate=False, create_id=True)
        ES().flush()
        return m1, m2, m3, s1, s2, a, b, c, d, e
Beispiel #25
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        first_set = amcattest.create_test_set()
        second_set = amcattest.create_test_set()
        project = amcattest.create_test_project()

        articles = [amcattest.create_test_article(project=project, create=False)
                    for _x in range(1213)]
        Article.create_articles(articles, first_set)
        ES().refresh()
        self.assertEqual(len(articles), first_set.get_count())
        # Copying the same articles into a second set should keep the count
        second_set.add_articles(articles)
        ES().refresh()
        self.assertEqual(len(articles), second_set.get_count())
        print(second_set.get_count())
Beispiel #26
0
    def scrape_unit(self, date_and_article_url):
        """Scrape a single article page.

        Returns an ArticleTree with the article's comments as children, or
        None when the page cannot be parsed.
        """
        date, article_url = date_and_article_url
        log.info("Fetching {}".format(article_url))
        article_doc = self.session.get_html(article_url)

        matches = article_doc.cssselect("#content > article")
        if not matches:
            log.error("Could not find article on {article_url}".format(**locals()))
            return None
        article_el = matches[0]

        title = article_el.cssselect("h1")[0].text
        # Use "." when the article body is empty/whitespace-only
        text = html2text(article_el.cssselect("p")).strip() or "."

        try:
            footer = article_el.cssselect("footer")[0]
        except IndexError as e:
            # Contains <embed> tag which is not closed gracefully :-(
            log.exception(e)
            return None

        author = footer.text.rsplit("|", 1)[0].strip()
        datetime_attr = article_el.cssselect("footer > time")[0].get("datetime")
        timestamp = parse_date(datetime_attr)
        if not title:
            return None

        children = self._get_comments(title, article_url, article_doc)

        article = Article(date=timestamp, title=title, text=text)
        article.set_property("author", author)
        article.set_property("url", article_url)
        article.set_property("medium", "GeenStijl")

        return ArticleTree(article, [ArticleTree(child, []) for child in children])
Beispiel #27
0
    def run(self):
        """Parse all input files, save the resulting articles into the target
        articleset, and append provenance to the set. Returns the articleset.

        Raises ParseError when any file failed to parse, and Exception when
        no articles were produced at all.
        """
        monitor = self.progress_monitor

        filename = self.options['filename']
        file_shortname = os.path.split(self.options['filename'])[-1]
        monitor.update(
            10,
            u"Importing {self.__class__.__name__} from {file_shortname} into {self.project}"
            .format(**locals()))

        articles = []
        encoding = self.options['encoding']
        files = list(self._get_files(filename, encoding))
        nfiles = len(files)
        for i, (file, encoding, data) in enumerate(files):
            monitor.update(
                20 / nfiles,
                "Parsing file {i}/{nfiles}: {file}".format(**locals()))
            articles += list(self.parse_file(file, encoding, data))

        for article in articles:
            _set_project(article, self.project)

        if self.errors:
            raise ParseError(" ".join(map(str, self.errors)))
        monitor.update(
            10,
            "All files parsed, saving {n} articles".format(n=len(articles)))
        Article.create_articles(articles,
                                articleset=self.get_or_create_articleset(),
                                monitor=monitor.submonitor(40))

        if not articles:
            raise Exception("No articles were imported")

        monitor.update(
            10,
            "Uploaded {n} articles, post-processing".format(n=len(articles)))

        aset = self.options["articleset"]
        # NOTE(review): 'file' here is the last file from the loop above; if
        # _get_files yields nothing this raises NameError — confirm intended.
        new_provenance = self.get_provenance(file, articles)
        aset.provenance = ("%s\n%s" %
                           (aset.provenance or "", new_provenance)).strip()
        aset.save()

        if getattr(self, 'task', None):
            self.task.log_usage("articles", "upload", n=len(articles))

        # BUG FIX: the message previously had no {n} placeholder, so the
        # article count passed to .format() was silently dropped.
        monitor.update(10, "Done! Uploaded {n} articles".format(n=len(articles)))
        return self.options["articleset"]
Beispiel #28
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        src = amcattest.create_test_set()
        dst = amcattest.create_test_set()
        medium = amcattest.create_test_medium()
        project = amcattest.create_test_project()

        articles = [amcattest.create_test_article(project=project, medium=medium,
                                                  create=False)
                    for _x in range(1213)]
        Article.create_articles(articles, src, create_id=True)
        ES().flush()
        self.assertEqual(len(articles), src.get_count())
        # Re-adding all articles to another set should copy every one of them
        dst.add_articles(articles, monitor=ProgressMonitor())
        ES().flush()
        self.assertEqual(len(articles), dst.get_count())
        print(dst.get_count())
Beispiel #29
0
def handle_split(form, project, article, sentences):
    """Split `article` at `sentences` and redistribute the resulting parts
    over articlesets according to the submitted form options.

    NOTE: returns locals() as the template context, so the local variable
    names in this function are part of its contract — do not rename them.
    """
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    # Remove the original article from the selected sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    # ...or from every set in the project that contains it
    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project,
                                              articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"],
                                        [article])

    # Template context: every local defined above is exposed to the caller
    return locals()
    def run(self, scraper):
        """Run one scraper: collect its units, scrape each, then save the
        resulting articles.

        Errors are accumulated in self.errors instead of being raised.
        Returns the set of saved article ids (or self.articles when even
        collecting units failed).
        """
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        for i, unit in enumerate(units):
            try:
                articles = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(i, unit, e))
                continue
            self.articles += articles

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            articles, errors = Article.create_articles(self.articles, scraper.articleset)
            # Deduplicated articles carry the id of their duplicate instead
            self.saved_article_ids = {getattr(a, "duplicate_of", a.id) for a in self.articles}
            for e in errors:
                self.errors.append(ScrapeError(None, None, e))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            # BUG FIX: removed the Python-2 "print e" statement (a syntax
            # error under Python 3, and mis-indented with a tab);
            # log.exception below already records the traceback.
            log.exception("scraper._get_units failed")

        return self.saved_article_ids
Beispiel #31
0
    def run(self, scraper):
        """Drive one scraper run: gather units, scrape each one, then save
        all collected articles. Failures are recorded in self.errors."""
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        for index, unit in enumerate(units):
            try:
                scraped = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(index, unit, e))
            else:
                self.articles += scraped

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            saved, errors = Article.create_articles(self.articles,
                                                    scraper.articleset)
            # Deduplicated articles report their duplicate's id instead
            self.saved_article_ids = {getattr(a, "duplicate_of", a.id)
                                      for a in self.articles}
            for err in errors:
                self.errors.append(ScrapeError(None, None, err))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")

        return self.saved_article_ids
Beispiel #32
0
    def scrape_unit(self, url):
        """Scrape one article through Firefox reader mode; returns an Article."""
        reader_url = "about:reader?url={}".format(url)
        doc = self.get_html(reader_url, wait_for="div.content p")

        # Strip unwanted elements before extracting the text
        for tag in REMOVE_TAGS:
            for element in doc.cssselect(tag):
                element.getparent().remove(element)

        content = doc.cssselect("div.content")[0]
        content_html = lxml.html.tostring(content).decode()

        title = doc.cssselect("h1.reader-title")[0].text_content().strip()
        text = html2text(content_html)

        if self.__class__.get_date is GenericScraper.get_date:
            # No subclass-specific date extraction available
            date = self.now
        else:
            # Get contents of un-firefox-read-ed article
            self.wait(".reader-toolbar .close-button").click()
            time.sleep(0.3)
            raw_html = self.wait("html").get_attribute("outerHTML")
            raw_doc = lxml.html.fromstring(raw_html, base_url=url)

            try:
                date = self.get_date(raw_doc)
            except NotImplementedError:
                date = self.now
            except Exception as e:
                log.warning("get_date() failed for {} with: {}".format(url, e))
                date = self.now

        return Article(date=date, title=title, text=text, url=url)
Beispiel #33
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        set_one = amcattest.create_test_set()
        set_two = amcattest.create_test_set()
        project = amcattest.create_test_project()

        articles = [amcattest.create_test_article(project=project, create=False)
                    for _x in range(1213)]
        Article.create_articles(articles, set_one)
        ES().refresh()
        self.assertEqual(set_one.get_count(), len(articles))
        # Adding the same articles to a second set should copy all of them
        set_two.add_articles(articles)
        ES().refresh()
        self.assertEqual(set_two.get_count(), len(articles))
        print(set_two.get_count())
Beispiel #34
0
    def test_list_media(self):
        """Test that list media works for more than 10 media"""
        from amcat.models import Article
        media = [amcattest.create_test_medium() for _ in range(20)]
        articles = [amcattest.create_test_article(medium=medium, create=False)
                    for medium in media]

        first = amcattest.create_test_set()
        Article.create_articles(articles[:5], articleset=first,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(media[:5]), set(first.get_mediums()))

        second = amcattest.create_test_set(project=first.project)
        Article.create_articles(articles[5:], articleset=second,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(media[5:]), set(second.get_mediums()))

        # The shared project sees the media of both sets combined
        self.assertEqual(set(media), set(first.project.get_mediums()))
Beispiel #35
0
 def create_articles(batch):
     """Save a batch of legacy article dicts and record old->new id mappings.

     NOTE(review): `self` is not a parameter here — presumably this is a
     closure capturing `self` from an enclosing method; verify at the caller.
     """
     for art in batch:
         # Keep the legacy id as a property; '-' placeholders for empty fields
         art['oldid_int'] = art.pop('old_id')
         if art['text'] == '':
             art['text'] = '-'
         if art['title'] == '':
             art['title'] = '-'
     articles = Article.create_articles(
         [Article(project_id=self.status.project.id, **art) for art in batch])
     self.status.articles.update(
         {a.get_property('oldid_int'): a.id for a in articles})
     return articles
Beispiel #36
0
def handle_split(form, project, article, sentences):
    """Split *article* at the given sentences and distribute the resulting
    articles over article sets as requested by *form*.

    Returns locals() for use as template context.
    Raises ValueError if *form* is invalid.
    """
    # BUG FIX: validate the form *before* creating any articles. Previously
    # validation happened after Article.create_articles, so an invalid
    # request still persisted the split articles.
    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
Beispiel #37
0
 def parse_file(self, file, encoding, _data):
     """Parse a CSV file and yield an Article per row."""
     for raw_row in csv.DictReader(_open(file, encoding)):
         mapped = self.map_article(raw_row)
         # Coerce every raw string to its typed value before constructing.
         properties = {key: parse_value(key, value)
                       for key, value in mapped.items()}
         yield Article.fromdict(properties)
Beispiel #38
0
 def parse_file(self, file):
     """Parse an uploaded CSV file (utf-8) and yield an Article per row."""
     rows = csv.DictReader(TextIOWrapper(file.file, encoding="utf8"))
     for raw_row in rows:
         mapped = self.map_article(raw_row)
         # Coerce every raw string to its typed value before constructing.
         properties = {key: self.parse_value(key, value)
                       for key, value in mapped.items()}
         yield Article.fromdict(properties)
Beispiel #39
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        s = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        m = amcattest.create_test_medium()
        p = amcattest.create_test_project()

        # A non-round count exercises any internal batching boundaries.
        arts = [
            amcattest.create_test_article(project=p, medium=m, create=False)
            for _x in range(1213)
        ]
        Article.create_articles(arts, s, create_id=True)
        ES().flush()
        self.assertEqual(len(arts), s.get_count())
        s2.add_articles(arts, monitor=ProgressMonitor())
        ES().flush()
        # Adding the same articles to a second set must copy all of them.
        # (A leftover debug print of the count was removed.)
        self.assertEqual(len(arts), s2.get_count())
Beispiel #40
0
 def parse_file(self, file, _data):
     """Parse *file* and yield an Article per record, applying DEFAULTS."""
     for raw_record in self.get_reader(file):
         mapped = self.map_article(raw_record, dict(DEFAULTS))
         # Coerce every raw value to its typed form before constructing.
         properties = {key: parse_value(key, value)
                       for key, value in mapped.items()}
         yield Article.fromdict(properties)
Beispiel #41
0
def article_to_json(article: Article) -> Dict[str, Union[str, int, float, datetime.datetime]]:
    """Serialize the static fields plus the free-form properties of *article*."""
    static_field_names = ("title", "text", "hash", "parent_hash", "url", "date")
    payload = {name: getattr(article, name) for name in static_field_names}
    payload["properties"] = dict(article.get_properties())
    return payload
Beispiel #42
0
    def save(self, **kwargs):
        """Flatten the deserialized articles, bulk-create them in the view's
        articleset, and return the flat list of saved objects."""
        def _flatten(l):
            """Turn either an object or a (recursive/irregular/jagged) list-of-lists into a flat list"""
            # inspired by http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
            # BUG FIX: collections.Iterable was removed in Python 3.10 (use
            # collections.abc), and `basestring` is py2-only (use str).
            # Strings are iterable but must be treated as atoms.
            if isinstance(l, collections.abc.Iterable) and not isinstance(l, str):
                for el in l:
                    for sub in _flatten(el):
                        yield sub
            else:
                yield l

        # flatten articles list (children in a many call yields a list of lists)
        self.object = list(_flatten(self.object))

        Article.create_articles(self.object, self.context['view'].articleset)

        # make sure that self.many is True for serializing result
        self.many = True
        return self.object
Beispiel #43
0
    def run(self):
        """Parse every file in the upload, save the resulting articles, and
        update the target articleset's provenance.

        Returns the target articleset; raises ParseError if any file failed
        to parse and Exception if no articles were produced.
        """
        upload = self.options['upload']
        upload.encoding_override(self.options['encoding'])

        monitor = self.progress_monitor

        root_dir = os.path.dirname(upload.filepath)

        monitor.update(10, u"Importing {self.__class__.__name__} from {upload.basename} into {self.project}"
                       .format(**locals()))

        articles = []
        files = self._get_files(upload)
        nfiles = len(upload)
        filemonitor = monitor.submonitor(nfiles, weight=60)
        for i, (file, data) in enumerate(files):
            filemonitor.update(1, "Parsing file {i}/{nfiles}: {file.name}".format(**locals()))
            articles += list(self.parse_file(file, data))

        for article in articles:
            _set_project(article, self.project)

        # Abort before saving anything if any parse errors were collected.
        if self.errors:
            raise ParseError(" ".join(map(str, self.errors)))
        monitor.update(10, "All files parsed, saving {n} articles".format(n=len(articles)))
        Article.create_articles(articles, articleset=self.get_or_create_articleset(),
                                monitor=monitor.submonitor(40))

        if not articles:
            raise Exception("No articles were imported")

        monitor.update(10, "Uploaded {n} articles, post-processing".format(n=len(articles)))

        aset = self.options['articleset']
        new_provenance = self.get_provenance(upload.basename, articles)
        aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
        aset.save()

        if getattr(self, 'task', None):
            self.task.log_usage("articles", "upload", n=len(articles))

        # BUG FIX: the message lacked the {n} placeholder, so the article
        # count passed to format() was silently discarded.
        monitor.update(10, "Done! Uploaded {n} articles".format(n=len(articles)))
        return self.options["articleset"]
Beispiel #44
0
    def _run(self, local_project, remote_host, remote_token, remote_project_id, remote_articleset_id):
        """Copy a remote articleset page by page into *local_project*.

        Returns the id of the newly created local ArticleSet; API errors are
        delegated to self.handleError.
        """
        try:
            page_size = 1000
            query = RemoteQuery(remote_host, remote_token, remote_project_id, remote_articleset_id, page_size=page_size)
            # Only copy whitelisted set fields. (Renamed the local variable:
            # it previously shadowed the builtin `set`.)
            set_fields = {k: v for k, v in query.get_articleset().items() if k in COPY_SET_FIELDS}
            set_fields.update(project=local_project)
            articleset = ArticleSet.objects.create(**set_fields)
            for page in query:
                # Create local articles and remember remote-hash -> local-hash
                # so parent links can be rewritten below.
                articles_hashes = [(self.create_article(x, local_project), x["hash"]) for x in page]
                hashmap = {old_hash: article.hash for article, old_hash in articles_hashes}
                articles, _ = zip(*articles_hashes)
                articles = list(articles)
                for article in articles:
                    if article.parent_hash in hashmap:
                        article.parent_hash = hashmap[article.parent_hash]

                Article.create_articles(articles, articleset=articleset)
            return articleset.id
        except APIError as e:
            self.handleError(e)
Beispiel #45
0
    def parse_file(self, file):
        """Yield an Article for every document found in *file*."""
        for doc in split_file(file):
            data = dict(parse_doc(doc))

            article_kwargs = {}
            for field, setting in self.options['field_map'].items():
                value, typ = setting['value'], setting['type']
                # 'field' settings look the value up in the parsed document;
                # any other type is taken as a literal value.
                resolved = data.get(value) if typ == 'field' else value
                if resolved:
                    article_kwargs[field] = resolved
            yield Article(**article_kwargs)
Beispiel #46
0
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(**{
            "date": datetime.date(2015, 1, 1),
            "title": "\u6f22\u5b57",
            "text": "Even more strange characters.. \x0C and \x08 woo?",
            "url": "https://example.com",
            "project": create_test_project()
        })

        hash = get_article_dict(article)['hash']
        Article.create_articles([article], articleset=amcattest.create_test_set())
        ES().refresh()
        # Materialize the query result once (a redundant second list() call
        # around the already-materialized list was removed).
        es_articles = list(ES().query_all(filters={"ids": [article.id]}, _source=["hash"]))
        es_article = es_articles[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(hash, article.hash)
Beispiel #47
0
    def test_deduplication(self):
        """Does deduplication work as it is supposed to?

        Checks that identical articles collapse onto one id, that dedupe
        still adds the existing article to a requested set, and that a
        pre-set hash is rejected.
        """

        # create dummy articles to have something in the db 
        [amcattest.create_test_article() for i in range(10)]
        amcates.ES().refresh()
        
        art = dict(project=amcattest.create_test_project(),
                   title="deduptest", text="test", date='2001-01-01')

        a1 = amcattest.create_test_article(**art)
        amcates.ES().refresh()
        self.assertEqual(_q(title='deduptest'), {a1.id})

        # duplicate articles should not be added
        a2 = amcattest.create_test_article(**art)
        amcates.ES().refresh()
        self.assertEqual(a2.id, a1.id)
        self.assertTrue(a2._duplicate)
        self.assertEqual(_q(title='deduptest'), {a1.id})

        # however, if an articleset is given the 'existing' article
        # should be added to that set
        s1 = amcattest.create_test_set()
        a3 = amcattest.create_test_article(articleset=s1, **art)
        amcates.ES().refresh()
        self.assertEqual(a3.id, a1.id)
        self.assertEqual(_q(title='deduptest'), {a1.id})
        self.assertEqual(set(s1.get_article_ids()), {a1.id})
        self.assertEqual(_q(sets=s1.id), {a1.id})

        # if an existing hash is set, it should be correct
        art2 = dict(hash=b'hash', **art)
        self.assertRaises(ValueError, amcattest.create_test_article, **art2)

        #TODO! Check duplicates within new articles
        # Duplicates *within* a single create_articles call should also
        # collapse onto one id.
        art['title'] = "internaldupe"
        a1, a2 = (Article(**art), Article(**art))
        Article.create_articles([a1, a2], articleset=s1)
        self.assertEqual(a1.id, a2.id)
        self.assertEqual(len(_q(title='internaldupe')), 1)
Beispiel #48
0
    def test_deduplication(self):
        """Does deduplication work as it is supposed to?"""

        # Seed the index with some unrelated articles first.
        for _ in range(10):
            amcattest.create_test_article()
        amcates.ES().refresh()

        fields = dict(project=amcattest.create_test_project(),
                      title="deduptest", text="test", date='2001-01-01')

        original = amcattest.create_test_article(**fields)
        amcates.ES().refresh()
        self.assertEqual(_q(title='deduptest'), {original.id})

        # A second article with identical fields must dedupe onto the first.
        duplicate = amcattest.create_test_article(**fields)
        amcates.ES().refresh()
        self.assertEqual(duplicate.id, original.id)
        self.assertTrue(duplicate._duplicate)
        self.assertEqual(_q(title='deduptest'), {original.id})

        # When an articleset is given, the 'existing' article should still
        # be added to that set.
        dedup_set = amcattest.create_test_set()
        third = amcattest.create_test_article(articleset=dedup_set, **fields)
        amcates.ES().refresh()
        self.assertEqual(third.id, original.id)
        self.assertEqual(_q(title='deduptest'), {original.id})
        self.assertEqual(set(dedup_set.get_article_ids()), {original.id})
        self.assertEqual(_q(sets=dedup_set.id), {original.id})

        # Supplying a pre-set (wrong) hash must be rejected.
        bad_hash_fields = dict(hash=b'hash', **fields)
        self.assertRaises(ValueError, amcattest.create_test_article, **bad_hash_fields)

        # Duplicates within a single create_articles call collapse to one id.
        fields['title'] = "internaldupe"
        left, right = Article(**fields), Article(**fields)
        Article.create_articles([left, right], articleset=dedup_set)
        self.assertEqual(left.id, right.id)
        self.assertEqual(len(_q(title='internaldupe')), 1)
Beispiel #49
0
def article_to_json(
        article: Article
) -> Dict[str, Union[str, int, float, datetime.datetime]]:
    """Return a JSON-serializable dict of *article*'s static fields plus its
    free-form properties."""
    payload = dict(
        title=article.title,
        text=article.text,
        hash=article.hash,
        parent_hash=article.parent_hash,
        url=article.url,
        date=article.date,
    )
    payload["properties"] = dict(article.get_properties())
    return payload
Beispiel #50
0
    def run(self, scraper):
        """Run *scraper*: collect units, scrape each, save the resulting
        articles, and return the set of saved article ids (or () on failure).

        Errors are accumulated in self.errors instead of aborting the run.
        """
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        for i, unit in enumerate(units):
            try:
                articles = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(i, unit, e))
                continue
            self.articles += articles

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            articles, errors = Article.create_articles(self.articles,
                                                       scraper.articleset)
            self.saved_article_ids = {a.id for a in self.articles}
            for e in errors:
                self.errors.append(ScrapeError(None, None, e))

            stats_log.info(
                json.dumps({
                    "action": "scraped_articles",
                    "narticles": len(self.saved_article_ids),
                    "scraper": scraper.__class__.__name__
                }))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            # BUG FIX: this previously logged "scraper._get_units failed",
            # copy-pasted from the handler above; it actually guards saving.
            log.exception("Article.create_articles failed")

        # Do we need to save these id's to more sets?
        if hasattr(scraper, "articlesets") and hasattr(self,
                                                       "saved_article_ids"):
            for aset in scraper.articlesets:
                stats_log.info(
                    json.dumps({
                        "action": "add_scraped_articles",
                        "articleset_id": aset.id,
                        "articleset__name": aset.name,
                        "narticles": len(self.saved_article_ids),
                        "project_id": aset.project_id,
                        "project__name": aset.project.name
                    }))
                aset.add_articles(self.saved_article_ids)

        return getattr(self, "saved_article_ids", ())
Beispiel #51
0
    def create_article(self, art_dict, project):
        """Build a local Article from a remote article dict, keeping only
        copyable fields and mapping elastic types onto model types."""
        fields = {k: v for k, v in art_dict.items() if k in COPY_ARTICLE_FIELDS}
        fields["project"] = project

        # Older servers used 'headline' where the current model uses 'title'.
        if 'headline' in fields and 'title' not in fields:
            fields['title'] = fields.pop('headline')

        fields = dict(self._map_es_type(k, v) for k, v in fields.items())
        return Article(**fields)
Beispiel #52
0
    def save(self, **kwargs):
        """Flatten the deserialized articles, bulk-create them in the view's
        articleset, and return the flat list of saved objects."""
        import collections.abc

        def _flatten(l):
            """Turn either an object or a (recursive/irregular/jagged) list-of-lists into a flat list"""
            # inspired by http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
            # BUG FIX: collections.Iterable was removed in Python 3.10 (use
            # collections.abc), and `basestring` is py2-only (use str).
            # Strings are iterable but must be treated as atoms.
            if isinstance(
                    l, collections.abc.Iterable) and not isinstance(l, str):
                for el in l:
                    for sub in _flatten(el):
                        yield sub
            else:
                yield l

        # flatten articles list (children in a many call yields a list of lists)
        self.object = list(_flatten(self.object))

        Article.create_articles(self.object, self.context['view'].articleset)

        # make sure that self.many is True for serializing result
        self.many = True
        return self.object
Beispiel #53
0
def copy_article(article: Article):
    """Return an unsaved shallow copy of *article*.

    The text and hash are intentionally NOT copied — the copy is expected
    to receive its own text (and therefore a fresh hash) later.
    """
    duplicate = Article(
        project_id=article.project_id,
        date=article.date,
        title=article.title,
        url=article.url,
        parent_hash=article.parent_hash,
    )
    duplicate.properties.update(article.properties)
    return duplicate
Beispiel #54
0
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(
            **{
                "date": datetime.date(2015, 1, 1),
                "title": "\u6f22\u5b57",
                "text": "Even more strange characters.. \x0C and \x08 woo?",
                "url": "https://example.com",
                "project": create_test_project()
            })

        hash = get_article_dict(article)['hash']
        Article.create_articles([article],
                                articleset=amcattest.create_test_set())
        ES().refresh()
        # Materialize the query result once (a redundant second list() call
        # around the already-materialized list was removed).
        es_articles = list(ES().query_all(filters={"ids": [article.id]},
                                          fields=["hash"]))
        es_article = es_articles[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(hash, article.hash)
Beispiel #55
0
    def _scrape_unit(self, document):
        """Parse one article document into an Article.

        Returns a one-tuple (article,) as the caller expects an iterable.
        """
        article = Article()
        metadata = list(META)

        # We select all 'div' elements directly under '.article'
        divs = document.cssselect("* > div")

        # Check for author field. If present: remove from metadata
        # fields list
        try:
            author_field = document.cssselect(".author")[0]
        except IndexError:
            pass
        else:
            # BUG FIX: str.lstrip("Von") strips any leading 'V'/'o'/'n'
            # *characters*, mangling names like "Vonnie" or "Vincent".
            # Strip the literal "Von" ("By") prefix instead.
            author = author_field.text_content()
            if author.startswith("Von"):
                author = author[len("Von"):]
            article.author = author.strip()
            divs.remove(author_field)

        # Strip everything before headline
        headline_field = document.cssselect("b.deHeadline")[0].getparent()
        divs = divs[divs.index(headline_field):]

        # Parse metadata. Loop through each 'div' within an article, along with
        # its field name according to META (thus based on its position)
        for field_name, element in zip(metadata, divs):
            if field_name is None:
                continue

            processor = PROCESSORS.get(field_name, lambda x: x)
            text_content = element.text_content().strip()
            setattr(article, field_name, processor(text_content))

        # Fetch the body text: all paragraphs joined by blank lines.
        paragraphs = [p.text_content() for p in document.cssselect("p")]
        article.text = ("\n\n".join(paragraphs)).strip()

        # We must return an iterable, so we return a one-tuple
        return (article,)
Beispiel #56
0
    def set_up(self):
        """Create two indexed test articles in a fresh articleset and build
        an ESQuerySet over that set."""
        self.aset = amcattest.create_test_set()
        self.asets = ArticleSet.objects.filter(id__in=[self.aset.id])
        self.project = self.aset.project

        # Fields shared by both test articles.
        shared = dict(publisher="De Speld", project=self.project, page_int=5)

        self.a1 = Article(
            title="Man leeft nog steeds in de gloria",
            text="Gezongen vloek op verjaardag maakt leven van man tot een vrolijke hel.",
            date=datetime.datetime(2017, 1, 2, 23, 22, 11),
            author="Rudolf Julius",
            exists="Once",
            section_int=10,
            tags_tag={"gloria", "vloek"},
            html="Man <i>leeft</i> nog steeds in de gloria",
            **shared,
        )

        self.a2 = Article(
            title="VVD trots op opkomende zon",
            text="Kabinetsbeleid om geen parasol over Nederland te zetten betaalt zich uit",
            date=datetime.datetime(2016, 12, 14, 15, 13, 12),
            author="Thomas Hogeling",
            section_int=11,
            tags_tag={"vvd", "nederland", "speld"},
            **shared,
        )

        # Index the articles so the queryset below can see them.
        Article.create_articles([self.a1, self.a2], articleset=self.aset)
        amcates.ES().refresh()
        self.qs = ESQuerySet(self.asets)
Beispiel #57
0
    def from_field_name(cls, field_name: str, **kwargs):
        """Construct a category object corresponding to the field_name's type. For example,
        the field 'date' would map to a IntervalCategory, while author would map to
        TextCategory.

        @param kwargs: additional parameters passed to corresponding Category"""
        is_json_field = field_name not in Article.static_fields()
        field_type = get_property_primitive_type(field_name)

        # Dispatch on the primitive type: scalars map to field categories,
        # datetimes map to interval categories.
        if field_type in (int, str, float):
            category_class = ArticleFieldCategory
        elif field_type == datetime.datetime:
            category_class = IntervalCategory
        else:
            raise ValueError("Did not recognize primitive field type: {} (on {})".format(field_type, field_name))
        return category_class(is_json_field=is_json_field, field_name=field_name, **kwargs)