Esempio n. 1
0
    def test_get_articles(self):
        """get_articles must split an article on the given sentences and
        reject invalid split points (foreign sentences, the headline)."""
        from amcat.models import Sentence

        # FIX: use a def instead of a lambda bound to a name (PEP 8 E731);
        # materialise the generator so errors surface at call time.
        def _get_articles(a, s):
            return list(get_articles(a, s))

        # Should raise exception if sentences not in article
        article, sentences = self.create_test_sentences()
        s1 = Sentence.objects.filter(id=amcattest.create_test_sentence().id)
        self.assertRaises(ValueError, _get_articles, article, s1)

        # Should raise an exception if we try to split on headline
        self.assertRaises(ValueError, _get_articles, article, sentences.filter(parnr=1))

        # Should return a "copy", with byline in "text" property
        arts = _get_articles(article, Sentence.objects.none())
        Article.create_articles(arts)

        # FIX: assertEquals is a deprecated alias of assertEqual
        self.assertEqual(len(arts), 1)
        sbd.create_sentences(arts[0])

        self.assertEqual(
            [s.sentence for s in sentences[1:]],
            [s.sentence for s in arts[0].sentences.all()[1:]]
        )

        self.assertTrue("foo" in arts[0].text)

        # Should be able to split on byline
        self.assertEqual(2, len(_get_articles(article, sentences[1:2])))
        a, b = _get_articles(article, sentences[4:5])

        # Check if text on splitted articles contains expected
        self.assertTrue("Einde" not in a.text)
        self.assertTrue("Einde" in b.text)
Esempio n. 2
0
    def _run(self, local_project, remote_host, remote_token, remote_project_id,
             remote_articleset_id):
        """Copy a remote articleset into *local_project*.

        Fetches the remote set's metadata and its articles page by page,
        remaps parent hashes onto the newly created local articles, and
        returns the id of the newly created local ArticleSet.  API errors
        are routed to self.handleError instead of being raised.
        """
        try:
            page_size = 1000
            query = RemoteQuery(remote_host,
                                remote_token,
                                remote_project_id,
                                remote_articleset_id,
                                page_size=page_size)
            # FIX: renamed local `set` so the builtin is not shadowed.
            # Copy only the whitelisted metadata fields of the remote set.
            set_fields = {
                k: v
                for k, v in query.get_articleset().items()
                if k in COPY_SET_FIELDS
            }
            set_fields.update(project=local_project)
            articleset = ArticleSet.objects.create(**set_fields)
            for page in query:
                articles_hashes = [(self.create_article(x, local_project),
                                    x["hash"]) for x in page]
                # Map remote hash -> local hash so parent links survive the copy
                hashmap = {
                    old_hash: article.hash
                    for article, old_hash in articles_hashes
                }
                articles, _ = zip(*articles_hashes)
                articles = list(articles)
                for article in articles:
                    if article.parent_hash in hashmap:
                        article.parent_hash = hashmap[article.parent_hash]

                Article.create_articles(articles, articleset=articleset)
            return articleset.id
        except APIError as e:
            self.handleError(e)
Esempio n. 3
0
    def test_list_media(self):
        """Test that list media works for more than 10 media"""
        from amcat.models import Article

        # Create 20 media, each with one unsaved article
        media = [amcattest.create_test_medium() for _ in range(20)]
        arts = []
        for medium in media:
            arts.append(amcattest.create_test_article(medium=medium, create=False))

        # Store the first five articles in a fresh set
        s1 = amcattest.create_test_set()
        Article.create_articles(arts[:5], articleset=s1,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(s1.get_mediums()), set(media[:5]))

        # Store the remaining fifteen in a second set of the same project
        s2 = amcattest.create_test_set(project=s1.project)
        Article.create_articles(arts[5:], articleset=s2,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(s2.get_mediums()), set(media[5:]))

        # The project should report the media of both sets combined
        self.assertEqual(set(s1.project.get_mediums()), set(media))
Esempio n. 4
0
    def test_aggregate(self):
        """Can we make tables per medium/date interval?"""
        from amcat.models import Article

        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        # One article outside s1 that must not show up in the aggregates
        unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

        a = amcattest.create_test_article(text='aap noot mies', medium=m1, date='2001-01-01', create=False)
        b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
        c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
        d = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)

        Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
        ES().flush()

        # Counts per medium
        per_medium = dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="mediumid"))
        self.assertEqual(per_medium, {m1.id: 1, m2.id: 3})

        # Counts per year and per month
        per_year = dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="year"))
        self.assertEqual(per_year, {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1})

        per_month = dict(ES().aggregate_query(filters=dict(sets=s1.id), group_by="date", date_interval="month"))
        self.assertEqual(per_month, {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1, datetime(2001, 2, 1): 2})

        # set statistics
        stats = ES().statistics(filters=dict(sets=s1.id))
        self.assertEqual(stats.n, 4)
        self.assertEqual(stats.start_date, datetime(2001, 1, 1))
        self.assertEqual(stats.end_date, datetime(2002, 1, 1))

        # media list
        self.assertEqual(set(ES().list_media(filters=dict(sets=s1.id))), {m1.id, m2.id})
Esempio n. 5
0
    def setup(self):
        """Create two articlesets plus five test articles; a-d are stored in s1, e lives in s2."""
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        a = amcattest.create_test_article(text='aap noot mies', title='m1', date='2001-01-01', create=False)
        b = amcattest.create_test_article(text='noot mies wim zus', title='m2', date='2001-02-01', create=False)
        c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', title='m2', date='2002-01-01', create=False)
        d = amcattest.create_test_article(text='noot mies wim zus', title='m2', date='2001-02-03', create=False)
        e = amcattest.create_test_article(text='aap noot mies', title='m3', articleset=s2)

        # Save a-d into s1 and make the index reflect them before returning
        Article.create_articles([a, b, c, d], articleset=s1)
        ES().refresh()
        return s1, s2, a, b, c, d, e
Esempio n. 6
0
 def test_create(self):
     """Can we create/store/index an article object?"""
     # Non-ASCII headline exercises unicode round-tripping through db and index
     a = amcattest.create_test_article(create=False, date='2010-12-31', headline=u'\ua000abcd\u07b4')
     Article.create_articles([a], create_id=True)
     db_a = Article.objects.get(pk=a.id)
     # flush so the query below sees the freshly indexed document
     amcates.ES().flush()
     es_a = list(amcates.ES().query(filters={'ids': [a.id]}, fields=["date", "headline"]))[0]
     # Headline must survive both storage backends unchanged
     self.assertEqual(a.headline, db_a.headline)
     self.assertEqual(a.headline, es_a.headline)
     # Date is stored as midnight of the given day
     self.assertEqual('2010-12-31T00:00:00', db_a.date.isoformat())
     self.assertEqual('2010-12-31T00:00:00', es_a.date.isoformat())
Esempio n. 7
0
 def test_create_order(self):
     """Is insert order preserved in id order?"""
     # Shuffle so any id/order correlation is not an artefact of creation order
     articles = [amcattest.create_test_article(create=False) for _i in range(25)]
     random.shuffle(articles)
     Article.create_articles(articles)
     ids = [a.id for a in articles]
     # is order preserved?
     self.assertEqual(ids, sorted(ids))
     # do the right articles have the right title?
     for saved in articles:
         indb = Article.objects.get(pk=saved.id)
         self.assertEqual(indb.title, saved.title)
Esempio n. 8
0
 def test_create_order(self):
     """Is insert order preserved in id order?"""
     # Build 25 unsaved articles, then shuffle them before saving so the
     # insertion order is independent of the creation order
     articles = [amcattest.create_test_article(create=False) for _i in range(25)]
     random.shuffle(articles)
     Article.create_articles(articles)
     ids = [a.id for a in articles]
     # is order preserved?
     self.assertEqual(ids, sorted(ids))
     # do the right articles have the right title?
     for saved in articles:
         indb = Article.objects.get(pk=saved.id)
         self.assertEqual(indb.title, saved.title)
Esempio n. 9
0
    def create(self, validated_data):
        """Add an article to the articleset: an existing one referenced by
        'id' (after a read-access check) or a new one built from the payload."""
        articleset = self.get_articleset()

        if 'id' in validated_data:
            article_id = validated_data['id']
            # Existing article: the requesting user must be allowed to read it
            _check_read_access(self.context['request'].user, [article_id])
            article = Article.objects.get(pk=article_id)
            articleset.add_articles([article])
        else:
            # New article: parse the payload and store it in the set
            article = json_to_article(validated_data, articleset.project)
            Article.create_articles([article], articleset=articleset)

        return article
Esempio n. 10
0
    def setup(self):
        """Build two articlesets and five articles: a-d go into s1, e into s2."""
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        a = amcattest.create_test_article(
            text='aap noot mies', title='m1', date='2001-01-01', create=False)
        b = amcattest.create_test_article(
            text='noot mies wim zus', title='m2', date='2001-02-01', create=False)
        c = amcattest.create_test_article(
            text='mies bla bla bla wim zus jet', title='m2', date='2002-01-01', create=False)
        d = amcattest.create_test_article(
            text='noot mies wim zus', title='m2', date='2001-02-03', create=False)
        e = amcattest.create_test_article(
            text='aap noot mies', title='m3', articleset=s2)

        # Persist a-d and refresh the index so they are queryable
        Article.create_articles([a, b, c, d], articleset=s1)
        ES().refresh()
        return s1, s2, a, b, c, d, e
Esempio n. 11
0
 def test_create(self):
     """Can we create/store/index an article object?"""
     # Unicode title exercises non-ASCII round-tripping
     a = amcattest.create_test_article(create=False, date='2010-12-31', title=u'\ua000abcd\u07b4')
     Article.create_articles([a])
     db_a = Article.objects.get(pk=a.id)
     # refresh so the index reflects the new document before querying
     amcates.ES().refresh()
     es_a = list(amcates.ES().query(filters={'ids': [a.id]}, _source=["date", "title", "hash"]))[0]
     # hash, title and date must agree between the db and elastic
     self.assertEqual(a.hash, db_a.hash)
     self.assertEqual(a.hash, es_a.hash)
     self.assertEqual(a.title, db_a.title)
     self.assertEqual(a.title, es_a.title)
     self.assertEqual('2010-12-31T00:00:00', db_a.date.isoformat())
     self.assertEqual('2010-12-31T00:00:00', es_a.date.isoformat())
Esempio n. 12
0
    def test_query_all(self):
        """Test that query_all works"""
        from amcat.models import Article

        # Twenty articles in one set: more than a single result page of 10
        articles = [amcattest.create_test_article(create=False) for _ in range(20)]
        aset = amcattest.create_test_set()
        Article.create_articles(articles, articleset=aset, check_duplicate=False, create_id=True)
        ES().flush()

        # A plain query is capped at the page size...
        page = ES().query(filters=dict(sets=aset.id), size=10)
        self.assertEqual(len(list(page)), 10)

        # ...but query_all pages through everything
        everything = ES().query_all(filters=dict(sets=aset.id), size=10)
        self.assertEqual(len(list(everything)), len(articles))
Esempio n. 13
0
    def create(self, validated_data):
        """Return the article added to this articleset.

        Either attach an existing article referenced by 'id' (after a
        read-access check) or create a new one from the posted data.
        """
        articleset = self.get_articleset()

        if 'id' not in validated_data:
            # Brand new article: parse the payload and store it in the set
            article = json_to_article(validated_data, articleset.project)
            Article.create_articles([article], articleset=articleset)
            return article

        # Existing article: verify read access before linking it to the set
        _check_read_access(self.context['request'].user,
                           [validated_data['id']])
        article = Article.objects.get(pk=validated_data['id'])
        articleset.add_articles([article])
        return article
Esempio n. 14
0
    def test_query_all(self):
        """Test that query_all works"""
        from amcat.models import Article

        # More articles than a single page of size 10 can hold
        articles = [amcattest.create_test_article(create=False) for _ in range(20)]
        aset = amcattest.create_test_set()
        Article.create_articles(articles, articleset=aset, check_duplicate=False)
        ES().flush()

        # A plain query stops at the page size
        self.assertEqual(len(list(ES().query(filters=dict(sets=aset.id), size=10))), 10)

        # query_all keeps paging until every article is returned
        self.assertEqual(len(list(ES().query_all(filters=dict(sets=aset.id), size=10))), len(articles))
Esempio n. 15
0
    def setup(self):
        """Create three media, two sets and five articles; a-d end up in s1, e in s2."""
        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        a = amcattest.create_test_article(
            text='aap noot mies', medium=m1, date='2001-01-01', create=False)
        b = amcattest.create_test_article(
            text='noot mies wim zus', medium=m2, date='2001-02-01', create=False)
        c = amcattest.create_test_article(
            text='mies bla bla bla wim zus jet', medium=m2, date='2002-01-01', create=False)
        d = amcattest.create_test_article(
            text='noot mies wim zus', medium=m2, date='2001-02-03', create=False)
        e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

        # Persist a-d into s1 and flush so queries see them immediately
        Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
        ES().flush()
        return m1, m2, m3, s1, s2, a, b, c, d, e
Esempio n. 16
0
    def setup(self):
        """Prepare fixtures: three media, two sets, five articles (a-d saved into s1)."""
        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()

        # (text, medium, date) for the four unsaved articles a-d
        specs = [
            ('aap noot mies', m1, '2001-01-01'),
            ('noot mies wim zus', m2, '2001-02-01'),
            ('mies bla bla bla wim zus jet', m2, '2002-01-01'),
            ('noot mies wim zus', m2, '2001-02-03'),
        ]
        a, b, c, d = [
            amcattest.create_test_article(text=text, medium=medium, date=date, create=False)
            for text, medium, date in specs
        ]
        e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

        Article.create_articles([a, b, c, d], articleset=s1)
        ES().flush()
        return m1, m2, m3, s1, s2, a, b, c, d, e
Esempio n. 17
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        s = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        p = amcattest.create_test_project()

        # 1213 articles: deliberately not a round multiple of any batch size
        arts = []
        for _x in range(1213):
            arts.append(amcattest.create_test_article(project=p, create=False))

        Article.create_articles(arts, s)
        ES().refresh()
        self.assertEqual(len(arts), s.get_count())

        # Copy all of them into the second set
        s2.add_articles(arts)
        ES().refresh()
        self.assertEqual(len(arts), s2.get_count())
        print(s2.get_count())
Esempio n. 18
0
    def run(self):
        """Parse all files referenced by the options and save the resulting
        articles into the target articleset.

        Returns the articleset the articles were added to.  Raises
        ParseError when any file failed to parse (nothing is saved in that
        case) and a plain Exception when no articles were imported at all.
        """
        monitor = self.progress_monitor

        filename = self.options['filename']
        file_shortname = os.path.split(self.options['filename'])[-1]
        monitor.update(
            10,
            u"Importing {self.__class__.__name__} from {file_shortname} into {self.project}"
            .format(**locals()))

        articles = []
        encoding = self.options['encoding']
        files = list(self._get_files(filename, encoding))
        nfiles = len(files)
        for i, (file, encoding, data) in enumerate(files):
            monitor.update(
                20 / nfiles,
                "Parsing file {i}/{nfiles}: {file}".format(**locals()))
            articles += list(self.parse_file(file, encoding, data))

        for article in articles:
            _set_project(article, self.project)

        # Abort before saving anything if any parse error was recorded
        if self.errors:
            raise ParseError(" ".join(map(str, self.errors)))
        monitor.update(
            10,
            "All files parsed, saving {n} articles".format(n=len(articles)))
        Article.create_articles(articles,
                                articleset=self.get_or_create_articleset(),
                                monitor=monitor.submonitor(40))

        if not articles:
            raise Exception("No articles were imported")

        monitor.update(
            10,
            "Uploaded {n} articles, post-processing".format(n=len(articles)))

        aset = self.options["articleset"]
        # NOTE(review): `file` here is the loop variable leaked from the parse
        # loop above (i.e. the last file); undefined if there were no files —
        # confirm this is intended.
        new_provenance = self.get_provenance(file, articles)
        aset.provenance = ("%s\n%s" %
                           (aset.provenance or "", new_provenance)).strip()
        aset.save()

        if getattr(self, 'task', None):
            self.task.log_usage("articles", "upload", n=len(articles))

        monitor.update(10, "Done! Uploaded articles".format(n=len(articles)))
        return self.options["articleset"]
Esempio n. 19
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        s = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        m = amcattest.create_test_medium()
        p = amcattest.create_test_project()

        # A non-round number of articles to exercise batching
        arts = []
        for _x in range(1213):
            arts.append(amcattest.create_test_article(project=p, medium=m, create=False))

        Article.create_articles(arts, s, create_id=True)
        ES().flush()
        self.assertEqual(len(arts), s.get_count())

        s2.add_articles(arts, monitor=ProgressMonitor())
        ES().flush()
        self.assertEqual(len(arts), s2.get_count())
        print(s2.get_count())
Esempio n. 20
0
def handle_split(form, project, article, sentences):
    """Split *article* at *sentences* and distribute the resulting articles
    over articlesets according to the validated *form*.

    Returns locals() as the context dict for the template, so the local
    variable names in this function are part of the template contract —
    do not rename them.
    """
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    # Optionally remove the original article from selected or all sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project,
                                              articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"],
                                        [article])

    return locals()
    def run(self, scraper):
        """Run *scraper* and return the ids of the saved articles.

        Failures at any stage are collected as ScrapeError entries in
        self.errors instead of being raised, so one broken unit does not
        abort the whole scrape.
        """
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        for i, unit in enumerate(units):
            try:
                articles = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(i, unit, e))
                continue
            self.articles += articles

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            articles, errors = Article.create_articles(self.articles, scraper.articleset)
            # Deduplicated articles expose the original's id as `duplicate_of`
            self.saved_article_ids = {getattr(a, "duplicate_of", a.id) for a in self.articles}
            for e in errors:
                self.errors.append(ScrapeError(None, None, e))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            # FIX: removed stray Python 2 `print e` statement (a syntax error
            # under Python 3, and tab-misindented); the exception is already
            # reported via log.exception below.
            log.exception("scraper._get_units failed")

        return self.saved_article_ids
Esempio n. 22
0
    def run(self, scraper):
        """Run *scraper* and return the ids of the articles that were saved.

        Failures at any stage are recorded as ScrapeError entries in
        self.errors instead of being raised, so a single failing unit does
        not abort the whole scrape.
        """
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        for i, unit in enumerate(units):
            try:
                articles = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(i, unit, e))
                continue
            self.articles += articles

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            articles, errors = Article.create_articles(self.articles,
                                                       scraper.articleset)
            # Deduplicated articles expose the id of the original as
            # `duplicate_of`; fall back to the article's own id otherwise.
            self.saved_article_ids = {
                getattr(a, "duplicate_of", a.id)
                for a in self.articles
            }
            for e in errors:
                self.errors.append(ScrapeError(None, None, e))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            # NOTE(review): message says _get_units but this guards
            # create_articles — looks like a copy-paste; confirm.
            log.exception("scraper._get_units failed")

        return self.saved_article_ids
Esempio n. 23
0
    def test_list_media(self):
        """Test that list media works for more than 10 media"""
        from amcat.models import Article

        # Twenty distinct media, one unsaved article per medium
        media = [amcattest.create_test_medium() for _ in range(20)]
        arts = [amcattest.create_test_article(medium=medium, create=False)
                for medium in media]

        first_set = amcattest.create_test_set()
        Article.create_articles(arts[:5], articleset=first_set,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(first_set.get_mediums()), set(media[:5]))

        second_set = amcattest.create_test_set(project=first_set.project)
        Article.create_articles(arts[5:], articleset=second_set,
                                check_duplicate=False, create_id=True)
        ES().flush()
        self.assertEqual(set(second_set.get_mediums()), set(media[5:]))

        # The project aggregates the media of both sets
        self.assertEqual(set(first_set.project.get_mediums()), set(media))
Esempio n. 24
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        source_set = amcattest.create_test_set()
        target_set = amcattest.create_test_set()
        project = amcattest.create_test_project()

        # 1213 unsaved articles (non-round count to exercise batching)
        articles = [amcattest.create_test_article(project=project, create=False)
                    for _x in range(1213)]
        Article.create_articles(articles, source_set)
        ES().refresh()
        self.assertEqual(len(articles), source_set.get_count())

        # All of them should also land in the second set
        target_set.add_articles(articles)
        ES().refresh()
        self.assertEqual(len(articles), target_set.get_count())
        print(target_set.get_count())
Esempio n. 25
0
 def create_articles(batch):
     """Save one batch of legacy article dicts and record the old->new id mapping.

     Closes over `self`; updates self.status.articles with
     {legacy id: new article id} and returns the created articles.
     """
     for a in batch:
         # Preserve the legacy id as a custom property on the new article
         a['oldid_int'] = a.pop('old_id')
         # Replace empty text/title with a '-' placeholder before saving
         if a['text'] == '': a['text'] = '-'
         if a['title'] == '': a['title'] = '-'
     articles = Article.create_articles([Article(project_id=self.status.project.id, **a) for a in batch])
     self.status.articles.update({a.get_property('oldid_int'): a.id for a in articles})
     return articles
Esempio n. 26
0
def handle_split(form, project, article, sentences):
    """Split *article* on *sentences*, store the pieces, and add/remove them
    from articlesets as requested by the validated *form*.

    The return value is locals(), used directly as template context, so
    the local names here (form_data, all_sets, new_set, ...) are part of
    the template contract — do not rename them.
    """
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    # Optionally remove the original article from selected or all sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
Esempio n. 27
0
    def test_add_many(self):
        """Can we add a large number of articles from one set to another?"""
        s = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        m = amcattest.create_test_medium()
        p = amcattest.create_test_project()

        # Build 1213 unsaved articles — intentionally not a round batch size
        arts = []
        for _x in range(1213):
            arts.append(amcattest.create_test_article(project=p, medium=m, create=False))

        Article.create_articles(arts, s, create_id=True)
        ES().flush()
        self.assertEqual(len(arts), s.get_count())

        s2.add_articles(arts, monitor=ProgressMonitor())
        ES().flush()
        self.assertEqual(len(arts), s2.get_count())
        print(s2.get_count())
Esempio n. 28
0
    def run(self):
        """Parse every file in the upload, save the articles into the target
        articleset and return that articleset.

        Raises ParseError when any file failed to parse (nothing is saved
        in that case) and a plain Exception when no articles were produced.
        """
        upload = self.options['upload']
        upload.encoding_override(self.options['encoding'])

        monitor = self.progress_monitor

        root_dir = os.path.dirname(upload.filepath)

        monitor.update(10, u"Importing {self.__class__.__name__} from {upload.basename} into {self.project}"
                       .format(**locals()))

        articles = []
        files = self._get_files(upload)
        nfiles = len(upload)
        # One submonitor tick per parsed file, worth 60% of total progress
        filemonitor = monitor.submonitor(nfiles, weight=60)
        for i, (file, data) in enumerate(files):
            filemonitor.update(1, "Parsing file {i}/{nfiles}: {file.name}".format(**locals()))
            articles += list(self.parse_file(file, data))

        for article in articles:
            _set_project(article, self.project)

        # Abort before saving anything if any parse error was recorded
        if self.errors:
            raise ParseError(" ".join(map(str, self.errors)))
        monitor.update(10, "All files parsed, saving {n} articles".format(n=len(articles)))
        Article.create_articles(articles, articleset=self.get_or_create_articleset(),
                                monitor=monitor.submonitor(40))

        if not articles:
            raise Exception("No articles were imported")

        monitor.update(10, "Uploaded {n} articles, post-processing".format(n=len(articles)))

        # Append this upload to the articleset's provenance record
        aset = self.options['articleset']
        new_provenance = self.get_provenance(upload.basename, articles)
        aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
        aset.save()

        if getattr(self, 'task', None):
            self.task.log_usage("articles", "upload", n=len(articles))

        monitor.update(10, "Done! Uploaded articles".format(n=len(articles)))
        return self.options["articleset"]
Esempio n. 29
0
    def save(self, **kwargs):
        """Flatten self.object into a flat list of articles, store them in the
        view's articleset, and return the saved list.

        NOTE(review): uses `basestring` and `collections.Iterable`, which are
        Python 2 / pre-3.10 spellings — confirm the target interpreter.
        """
        def _flatten(l):
            """Turn either an object or a (recursive/irregular/jagged) list-of-lists into a flat list"""
            # inspired by http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
            if isinstance(l, collections.Iterable) and not isinstance(l, basestring):
                for el in l:
                    for sub in _flatten(el):
                        yield sub
            else:
                yield l

        # flatten articles list (children in a many call yields a list of lists)
        self.object = list(_flatten(self.object))

        Article.create_articles(self.object, self.context['view'].articleset)

        # make sure that self.many is True for serializing result
        self.many = True
        return self.object
Esempio n. 30
0
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(**{
            "date": datetime.date(2015, 1, 1),
            "title": "\u6f22\u5b57",
            "text": "Even more strange characters.. \x0C and \x08 woo?",
            "url": "https://example.com",
            "project": create_test_project()
        })

        # The hash computed from the article dict must match both the stored
        # article and the document that ends up in elastic
        hash = get_article_dict(article)['hash']
        Article.create_articles([article], articleset=amcattest.create_test_set())
        ES().refresh()
        # FIX: materialise the query result once (it was list()-ed twice)
        es_articles = list(ES().query_all(filters={"ids": [article.id]}, _source=["hash"]))
        es_article = es_articles[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(hash, article.hash)
Esempio n. 31
0
    def _run(self, local_project, remote_host, remote_token, remote_project_id, remote_articleset_id):
        """Copy a remote articleset into *local_project* and return the id of
        the newly created local set; API errors go to self.handleError."""
        try:
            page_size = 1000
            query = RemoteQuery(remote_host, remote_token, remote_project_id, remote_articleset_id, page_size=page_size)
            # FIX: renamed local `set` so the builtin is not shadowed
            set_fields = {k: v for k, v in query.get_articleset().items() if k in COPY_SET_FIELDS}
            set_fields.update(project=local_project)
            aset = ArticleSet.objects.create(**set_fields)
            for page in query:
                articles_hashes = [(self.create_article(x, local_project), x["hash"]) for x in page]
                # remote hash -> local hash, so parent links keep working after the copy
                hashmap = {old_hash: article.hash for article, old_hash in articles_hashes}
                articles, _ = zip(*articles_hashes)
                articles = list(articles)
                for article in articles:
                    if article.parent_hash in hashmap:
                        article.parent_hash = hashmap[article.parent_hash]

                Article.create_articles(articles, articleset=aset)
            return aset.id
        except APIError as e:
            self.handleError(e)
Esempio n. 32
0
    def test_deduplication(self):
        """Does deduplication work as it is supposed to?"""

        # create dummy articles to have something in the db 
        [amcattest.create_test_article() for i in range(10)]
        amcates.ES().refresh()
        
        art = dict(project=amcattest.create_test_project(),
                   title="deduptest", text="test", date='2001-01-01')

        a1 = amcattest.create_test_article(**art)
        amcates.ES().refresh()
        self.assertEqual(_q(title='deduptest'), {a1.id})

        # duplicate articles should not be added
        a2 = amcattest.create_test_article(**art)
        amcates.ES().refresh()
        # a2 resolves to the existing article's id and carries a duplicate flag
        self.assertEqual(a2.id, a1.id)
        self.assertTrue(a2._duplicate)
        self.assertEqual(_q(title='deduptest'), {a1.id})

        # however, if an articleset is given the 'existing' article
        # should be added to that set
        s1 = amcattest.create_test_set()
        a3 = amcattest.create_test_article(articleset=s1, **art)
        amcates.ES().refresh()
        self.assertEqual(a3.id, a1.id)
        self.assertEqual(_q(title='deduptest'), {a1.id})
        self.assertEqual(set(s1.get_article_ids()), {a1.id})
        self.assertEqual(_q(sets=s1.id), {a1.id})

        # if an existing hash is set, it should be correct
        art2 = dict(hash=b'hash', **art)
        self.assertRaises(ValueError, amcattest.create_test_article, **art2)

        #TODO! Check duplicates within new articles
        # Two identical new articles in one call should collapse to one id
        art['title'] = "internaldupe"
        a1, a2 = (Article(**art), Article(**art))
        Article.create_articles([a1, a2], articleset=s1)
        self.assertEqual(a1.id, a2.id)
        self.assertEqual(len(_q(title='internaldupe')), 1)
Esempio n. 33
0
    def test_deduplication(self):
        """Does deduplication work as it is supposed to?"""

        # Seed the index with unrelated articles so dedup has to pick the
        # test article out of existing content.
        for _ in range(10):
            amcattest.create_test_article()
        amcates.ES().refresh()

        fields = dict(project=amcattest.create_test_project(),
                      title="deduptest", text="test", date='2001-01-01')

        original = amcattest.create_test_article(**fields)
        amcates.ES().refresh()
        self.assertEqual(_q(title='deduptest'), {original.id})

        # An article with identical content must not create a new row.
        dupe = amcattest.create_test_article(**fields)
        amcates.ES().refresh()
        self.assertEqual(dupe.id, original.id)
        self.assertTrue(dupe._duplicate)
        self.assertEqual(_q(title='deduptest'), {original.id})

        # When an articleset is supplied, the pre-existing article should be
        # added to that set rather than re-created.
        aset = amcattest.create_test_set()
        dupe_in_set = amcattest.create_test_article(articleset=aset, **fields)
        amcates.ES().refresh()
        self.assertEqual(dupe_in_set.id, original.id)
        self.assertEqual(_q(title='deduptest'), {original.id})
        self.assertEqual(set(aset.get_article_ids()), {original.id})
        self.assertEqual(_q(sets=aset.id), {original.id})

        # An explicitly supplied (wrong) hash must be rejected.
        bad_hash_fields = dict(hash=b'hash', **fields)
        self.assertRaises(ValueError, amcattest.create_test_article, **bad_hash_fields)

        #TODO! Check duplicates within new articles
        fields['title'] = "internaldupe"
        first, second = Article(**fields), Article(**fields)
        Article.create_articles([first, second], articleset=aset)
        self.assertEqual(first.id, second.id)
        self.assertEqual(len(_q(title='internaldupe')), 1)
Esempio n. 34
0
    def run(self, scraper):
        """Run the given scraper: collect units, scrape each unit, then save
        the resulting articles to ``scraper.articleset`` (and, if present,
        to each set in ``scraper.articlesets``).

        Errors from any stage are accumulated as ScrapeError entries in
        ``self.errors`` rather than raised.

        Returns the set of saved article ids; returns ``self.articles`` if
        unit collection fails, or ``()`` if saving never succeeded.
        """
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        # Scrape unit-by-unit; a failing unit is recorded and skipped so the
        # remaining units still get scraped.
        for i, unit in enumerate(units):
            try:
                articles = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(i, unit, e))
                continue
            self.articles += articles

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            articles, errors = Article.create_articles(self.articles,
                                                       scraper.articleset)
            # NOTE(review): ids are read from self.articles rather than the
            # articles returned by create_articles -- presumably the same
            # objects are saved in place; confirm against create_articles.
            self.saved_article_ids = {a.id for a in self.articles}
            for e in errors:
                self.errors.append(ScrapeError(None, None, e))

            stats_log.info(
                json.dumps({
                    "action": "scraped_articles",
                    "narticles": len(self.saved_article_ids),
                    "scraper": scraper.__class__.__name__
                }))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            # Fixed copy-pasted message: this except guards article saving,
            # not unit collection.
            log.exception("Article.create_articles failed")

        # Do we need to save these id's to more sets?
        if hasattr(scraper, "articlesets") and hasattr(self,
                                                       "saved_article_ids"):
            for aset in scraper.articlesets:
                stats_log.info(
                    json.dumps({
                        "action": "add_scraped_articles",
                        "articleset_id": aset.id,
                        "articleset__name": aset.name,
                        "narticles": len(self.saved_article_ids),
                        "project_id": aset.project_id,
                        "project__name": aset.project.name
                    }))
                aset.add_articles(self.saved_article_ids)

        return getattr(self, "saved_article_ids", ())
Esempio n. 35
0
    def save(self, **kwargs):
        """Flatten ``self.object`` into a flat list of articles, create them
        in the current view's articleset, and return the list.

        Also forces ``self.many = True`` so the result serializes as a list.
        """
        import collections.abc

        def _flatten(l):
            """Turn either an object or a (recursive/irregular/jagged) list-of-lists into a flat list"""
            # inspired by http://stackoverflow.com/questions/2158395/flatten-an-irregular-list-of-lists-in-python
            # str/bytes are iterable but must be treated as scalar leaves.
            # (Python 3 port: `collections.Iterable` was removed in 3.10 and
            # `basestring` does not exist; use abc.Iterable and (str, bytes).)
            if isinstance(l, collections.abc.Iterable) and not isinstance(l, (str, bytes)):
                for el in l:
                    for sub in _flatten(el):
                        yield sub
            else:
                yield l

        # flatten articles list (children in a many call yields a list of lists)
        self.object = list(_flatten(self.object))

        Article.create_articles(self.object, self.context['view'].articleset)

        # make sure that self.many is True for serializing result
        self.many = True
        return self.object
Esempio n. 36
0
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(
            **{
                "date": datetime.date(2015, 1, 1),
                "title": "\u6f22\u5b57",
                "text": "Even more strange characters.. \x0C and \x08 woo?",
                "url": "https://example.com",
                "project": create_test_project()
            })

        # Hash computed locally from the article dict, before saving.
        # (Renamed from `hash`, which shadowed the builtin.)
        expected_hash = get_article_dict(article)['hash']
        Article.create_articles([article],
                                articleset=amcattest.create_test_set())
        ES().refresh()
        es_articles = ES().query_all(filters={"ids": [article.id]},
                                     fields=["hash"])
        # query_all returns an iterable; materialize once and index directly
        # (the original wrapped the list in list() a second time).
        es_articles = list(es_articles)
        es_article = es_articles[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(expected_hash, es_article.hash)
        self.assertEqual(expected_hash, article.hash)
Esempio n. 37
0
    def set_up(self):
        """Create a test articleset containing two articles and an
        ESQuerySet wrapping it; everything is stored on self for the tests."""
        self.aset = amcattest.create_test_set()
        self.asets = ArticleSet.objects.filter(id__in=[self.aset.id])
        self.project = self.aset.project

        # Field dicts for the two fixture articles; flexible fields use the
        # type-suffixed naming (page_int, tags_tag, ...).
        fields_one = dict(
            title="Man leeft nog steeds in de gloria",
            text="Gezongen vloek op verjaardag maakt leven van man tot een vrolijke hel.",
            date=datetime.datetime(2017, 1, 2, 23, 22, 11),
            author="Rudolf Julius",
            publisher="De Speld",
            project=self.project,
            exists="Once",
            page_int=5,
            section_int=10,
            tags_tag={"gloria", "vloek"},
            html="Man <i>leeft</i> nog steeds in de gloria",
        )

        fields_two = dict(
            title="VVD trots op opkomende zon",
            text="Kabinetsbeleid om geen parasol over Nederland te zetten betaalt zich uit",
            date=datetime.datetime(2016, 12, 14, 15, 13, 12),
            author="Thomas Hogeling",
            publisher="De Speld",
            project=self.project,
            page_int=5,
            section_int=11,
            tags_tag={"vvd", "nederland", "speld"},
        )

        self.a1 = Article(**fields_one)
        self.a2 = Article(**fields_two)

        Article.create_articles([self.a1, self.a2], articleset=self.aset)

        amcates.ES().refresh()

        self.qs = ESQuerySet(self.asets)
Esempio n. 38
0
    def run(self, scraper):
        """Run the given scraper: collect units, scrape each unit, then save
        the resulting articles to ``scraper.articleset`` (and, if present,
        to each set in ``scraper.articlesets``).

        Errors from any stage are accumulated as ScrapeError entries in
        ``self.errors`` rather than raised.

        Returns the set of saved article ids; returns ``self.articles`` if
        unit collection fails, or ``()`` if saving never succeeded.
        """
        try:
            units = list(scraper._get_units())
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            log.exception("scraper._get_units failed")
            return self.articles

        # Scrape unit-by-unit; a failing unit is recorded and skipped so the
        # remaining units still get scraped.
        for i, unit in enumerate(units):
            try:
                articles = list(scraper._scrape_unit(unit))
            except Exception as e:
                log.exception("scraper._scrape_unit failed")
                self.errors.append(ScrapeError(i, unit, e))
                continue
            self.articles += articles

        for article in self.articles:
            _set_default(article, 'project', scraper.project)

        try:
            articles, errors = Article.create_articles(self.articles, scraper.articleset)
            # NOTE(review): ids are read from self.articles rather than the
            # articles returned by create_articles -- presumably the same
            # objects are saved in place; confirm against create_articles.
            self.saved_article_ids = {a.id for a in self.articles}
            for e in errors:
                self.errors.append(ScrapeError(None, None, e))

            stats_log.info(json.dumps({
                "action": "scraped_articles", "narticles": len(self.saved_article_ids),
                "scraper": scraper.__class__.__name__
            }))
        except Exception as e:
            self.errors.append(ScrapeError(None, None, e))
            # Fixed copy-pasted message: this except guards article saving,
            # not unit collection.
            log.exception("Article.create_articles failed")

        # Do we need to save these id's to more sets?
        if hasattr(scraper, "articlesets") and hasattr(self, "saved_article_ids"):
            for aset in scraper.articlesets:
                stats_log.info(json.dumps({
                    "action": "add_scraped_articles", "articleset_id": aset.id,
                    "articleset__name": aset.name, "narticles": len(self.saved_article_ids),
                    "project_id": aset.project_id, "project__name": aset.project.name
                }))
                aset.add_articles(self.saved_article_ids)

        return getattr(self, "saved_article_ids", ())
Esempio n. 39
0
    def create(self, validated_data):
        """Create or attach articles for the articleset named in the URL.

        Entries without an "id" are built via json_to_article and created in
        the set; entries with an "id" are access-checked and linked to the
        set. Yields the resulting Article objects.

        Raises ValueError when the URL carries no articleset parameter.
        """
        articleset_id = self.context["view"].kwargs.get('articleset')
        if articleset_id is None:
            raise ValueError("Missing articleset parameter?")
        articleset = ArticleSet.objects.get(pk=articleset_id)
        project = articleset.project

        # New articles: anything the payload does not identify by id.
        fresh = [json_to_article(entry, project)
                 for entry in validated_data if "id" not in entry]
        if fresh:
            yield from Article.create_articles(fresh, articleset=articleset)

        # Existing articles: link them to the set after a read-access check.
        existing_ids = [entry['id'] for entry in validated_data if "id" in entry]
        if existing_ids:
            _check_read_access(self.context['request'].user, existing_ids)
            articleset.add_articles(existing_ids)
            yield from Article.objects.filter(pk__in=existing_ids).only("pk")
Esempio n. 40
0
    def create(self, validated_data):
        """Create or attach articles for the articleset named in the URL.

        Entries without an "id" are built via json_to_article and created in
        the set; entries with an "id" are access-checked and linked to the
        set. Yields the resulting Article objects.

        Raises ValueError when the URL carries no articleset parameter.
        """
        articleset_id = self.context["view"].kwargs.get('articleset')
        if articleset_id is None:
            raise ValueError("Missing articleset parameter?")
        articleset = ArticleSet.objects.get(pk=articleset_id)
        project = articleset.project

        # New articles: anything the payload does not identify by id.
        fresh = [json_to_article(entry, project)
                 for entry in validated_data if "id" not in entry]
        if fresh:
            yield from Article.create_articles(fresh, articleset=articleset)

        # Existing articles: link them to the set after a read-access check.
        existing_ids = [entry['id'] for entry in validated_data if "id" in entry]
        if existing_ids:
            _check_read_access(self.context['request'].user, existing_ids)
            articleset.add_articles(existing_ids)
            yield from Article.objects.filter(pk__in=existing_ids).only("pk")
Esempio n. 41
0
    def test_highlight_fragments(self):
        """Index one long article and check that highlight_fragments returns
        the expected HTML-escaped, <mark>-tagged snippets for a phrase query
        on the text field, and an unmarked fragment for the title field."""
        self.set_up()

        articleset = amcattest.create_test_set()
        project = articleset.project

        # Multi-paragraph fixture text; paragraphs are separated by blank
        # lines and re-flowed below. Contains an <i> tag to exercise HTML
        # escaping in the returned fragments.
        text = """
        The Alderman Proctor's Drinking Fountain (grid reference ST566738) is a historic building
        on Clifton Down, Bristol, England.

        The city of Bristol began supplying municipal drinking water in 1858. To inform the public
        about the new water supply, Robert Lang made a proposal though the Bristol Times that public
        drinking fountains be constructed. Lang began the "Fountain Fund" in January 1859 with a
        donation of one hundred pounds. By 1906, there were more than 40 public drinking fountains
        throughout the city.

        In 1872, Alderman Thomas Proctor commissioned the firm of George and Henry Godwin to build
        the fountain to commemorate the 1861 presentation of <i>Clifton Down</i> to the City of
        Bristol by the Society of Merchant Venturers.

        **Commemorative plaque**

        The three-sided fountain is done in Gothic Revival style. The main portion is of limestone
        with pink marble columns and white marble surround. The commemorative plaque is of black
        lettering on white marble; the plaque reads, "Erected by Alderman Thomas Proctor, of Bristol
        to record the liberal gift of certain rights on Clifton Down made to the citizens by the
        Society of Merchant Venturers under the provision of the Clifton and Drudham Downs Acts
        of Parliament, 1861, whereby the enjoyment of these Downs is preserved to the citizens of
        Bristol for ever." The fountain bears the coat of arms for the city of Bristol, the Society
        of Merchant Venturers and that of Alderman Thomas Proctor.

        The fountain was originally situated at the head of Bridge Valley Road. It became a sight
        impediment to modern auto traffic in the later 20th century. The fountain was moved to the
        other side of the road, closer to the Mansion House in 1987. After the move, it underwent
        restoration and was re-dedicated on 1 May 1988. It has been designated by English Heritage
        as a grade II listed building since 1977.
        """

        # Re-flow each paragraph onto one line; paragraphs stay separated by
        # blank lines when re-joined below.
        paragraphs = [" ".join(s.strip() for s in p.strip().split("\n")) for p in text.split("\n\n")]

        long_article = Article(
            title="Alderman Proctor's Drinking Fountain",
            text="\n\n".join(paragraphs).strip(),
            date=datetime.datetime(2017, 1, 18, 13, 29, 11),
            url="https://en.wikipedia.org/wiki/Alderman_Proctor%27s_Drinking_Fountain",
            publisher="Wikipedia",
            project=project
        )

        Article.create_articles([long_article], articleset)
        amcates.ES().refresh()

        # Phrase query on "Clifton Down" over both fields, ~50-char fragments.
        qs = ESQuerySet(ArticleSet.objects.filter(id=articleset.id))
        fragments = qs.highlight_fragments('"Clifton Down"', ("text", "title"), fragment_size=50)

        self.assertEqual(1, len(qs))
        self.assertEqual(1, len(fragments))

        # One entry per article; unwrap the single article's fragment dict.
        fragments = next(iter(fragments.values()))
        text_fragments = set(fragments["text"])
        title_fragments = fragments["title"]

        # Title does not match the phrase, so its fragment carries no <mark>.
        self.assertEqual(1, len(title_fragments))
        self.assertNotIn("<mark>", title_fragments[0])
        self.assertEqual(3, len(text_fragments))
        self.assertEqual(text_fragments, {
             ' presentation of &lt;i&gt;<mark>Clifton</mark> <mark>Down</mark>&lt;/i&gt; to the City of Bristol',
             ' <mark>Clifton</mark> <mark>Down</mark>, Bristol, England.\n\nThe city of Bristol',
             ' the liberal gift of certain rights on <mark>Clifton</mark> <mark>Down</mark> made'
        })
Esempio n. 42
0
    def test_aggregate(self):
        """Can we make tables per medium/date interval?"""
        from amcat.models import Article
        m1 = amcattest.create_test_medium(name="De Nep-Krant")
        m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
        s1 = amcattest.create_test_set()
        s2 = amcattest.create_test_set()
        # Article in a different set: must NOT appear in any s1 aggregate.
        unused = amcattest.create_test_article(text='aap noot mies',
                                               medium=m3,
                                               articleset=s2)
        # create=False: build but do not save yet; all four are saved in one
        # create_articles call below.
        a = amcattest.create_test_article(text='aap noot mies',
                                          medium=m1,
                                          date='2001-01-01',
                                          create=False)
        b = amcattest.create_test_article(text='noot mies wim zus',
                                          medium=m2,
                                          date='2001-02-01',
                                          create=False)
        c = amcattest.create_test_article(text='mies bla bla bla wim zus jet',
                                          medium=m2,
                                          date='2002-01-01',
                                          create=False)
        d = amcattest.create_test_article(text='noot mies wim zus',
                                          medium=m2,
                                          date='2001-02-03',
                                          create=False)

        # check_duplicate=False so all four articles are kept even if their
        # content hashes collide.
        Article.create_articles([a, b, c, d],
                                articleset=s1,
                                check_duplicate=False)
        ES().flush()

        # counts per mediumid
        self.assertEqual(
            dict(ES().aggregate_query(filters=dict(sets=s1.id),
                                      group_by="mediumid")), {
                                          m1.id: 1,
                                          m2.id: 3
                                      })

        # counts per medium (name)
        self.assertEqual(
            dict(ES().aggregate_query(filters=dict(sets=s1.id),
                                      group_by="medium")), {
                                          m1.name: 1,
                                          m2.name: 3
                                      })

        # counts per year: dates are bucketed to the start of each year
        self.assertEqual(
            dict(ES().aggregate_query(filters=dict(sets=s1.id),
                                      group_by="date",
                                      date_interval="year")), {
                                          datetime(2001, 1, 1): 3,
                                          datetime(2002, 1, 1): 1
                                      })

        # counts per month: b and d (Feb 2001) fall in the same bucket
        self.assertEqual(
            dict(ES().aggregate_query(filters=dict(sets=s1.id),
                                      group_by="date",
                                      date_interval="month")), {
                                          datetime(2001, 1, 1): 1,
                                          datetime(2002, 1, 1): 1,
                                          datetime(2001, 2, 1): 2
                                      })

        # set statistics
        stats = ES().statistics(filters=dict(sets=s1.id))
        self.assertEqual(stats.n, 4)
        self.assertEqual(stats.start_date, datetime(2001, 1, 1))
        self.assertEqual(stats.end_date, datetime(2002, 1, 1))

        # media list
        self.assertEqual(set(ES().list_media(filters=dict(sets=s1.id))),
                         {m1.id, m2.id})