Example #1
0
    def test_post_multiple(self):
        """Post two articles at once and check the response, listing, database and index."""
        first, second = test_article(), test_article()

        posted = self._post_articles([first, second])
        self.assertEqual(2, len(posted))
        # POST should only return IDs
        self.assertEqual(set(posted[0].keys()), {'id'})

        # The default listing carries titles but no text
        listed = self._get_articles()['results']
        listed_titles = {entry['title'] for entry in listed}
        self.assertEqual(listed_titles, {first['title'], second['title']})
        self.assertNotIn("text", listed[0].keys())

        # Text is only included when explicitly requested
        listed = self._get_articles(text=True)['results']
        listed_texts = {entry['text'] for entry in listed}
        self.assertEqual(listed_texts, {first['text'], second['text']})

        # The returned ids resolve to stored articles, in posting order
        stored = [Article.objects.get(pk=entry["id"]) for entry in posted]
        self.assertEqual(stored[0].title, first['title'])
        self.assertEqual(stored[1].title, second['title'])

        # Are the articles added to the index?
        amcates.ES().refresh()
        indexed_ids = set(
            amcates.ES().query_ids(filters={"sets": self.aset.id}))
        self.assertEqual(len(indexed_ids), 2)
Example #2
0
    def test_create(self):
        """Post a single article to an empty set and check the listing and the index."""
        aset = amcattest.create_test_set()

        # is the set empty? (aka can we get the results)
        url = ArticleViewSet.get_url(project=aset.project.id,
                                     articleset=aset.id)
        listing = self.get(url)
        self.assertEqual(listing['results'], [])

        body = {
            'text': 'bla',
            'headline': 'headline',
            'date': '2013-01-01T00:00:00',
            'medium': 'test_medium'
        }

        created = self.post(url, body, as_user=aset.project.owner)
        if isinstance(created, list):
            # a list response must hold exactly one article
            (created,) = created
        self.assertEqual(created['headline'], body['headline'])

        listing = self.get(url)
        self.assertEqual(len(listing['results']), 1)
        article = listing['results'][0]
        self.assertEqual(article['headline'], body['headline'])
        self.assertEqual(article['project'], aset.project_id)
        self.assertEqual(article['length'], 2)

        # Is the result added to the elastic index as well?
        from amcat.tools import amcates
        amcates.ES().flush()
        hits = amcates.ES().query(filters=dict(sets=aset.id),
                                  fields=["text", "headline", 'medium'])
        hits = list(hits)
        self.assertEqual(len(hits), 1)
        self.assertEqual(hits[0].medium, "test_medium")
        self.assertEqual(hits[0].headline, "headline")
Example #3
0
 def test_create(self):
     """Can we create/store/index an article object?"""
     # headline deliberately contains non-ASCII characters
     headline = u'\ua000abcd\u07b4'
     article = amcattest.create_test_article(create=False, date='2010-12-31', headline=headline)
     Article.create_articles([article], create_id=True)
     stored = Article.objects.get(pk=article.id)
     amcates.ES().flush()
     hits = amcates.ES().query(filters={'ids': [article.id]}, fields=["date", "headline"])
     indexed = list(hits)[0]
     # headline must be identical in the database and in the index
     self.assertEqual(article.headline, stored.headline)
     self.assertEqual(article.headline, indexed.headline)
     # a bare date is stored as midnight of that day
     self.assertEqual('2010-12-31T00:00:00', stored.date.isoformat())
     self.assertEqual('2010-12-31T00:00:00', indexed.date.isoformat())
Example #4
0
    def test_post_properties(self):
        """An extra property posted with an article is searchable and retrievable."""
        article = test_article(foo='bar')
        posted = self._post_articles(article)

        # the property can be used as an index filter
        matching = set(amcates.ES().query_ids(filters={"foo": "bar"}))
        self.assertEqual(matching, {posted["id"]})

        # the indexed document carries the property as a top-level key
        indexed = amcates.ES().get(id=posted['id'])
        self.assertEqual(indexed['foo'], 'bar')

        # the fetched article exposes it under 'properties'
        fetched = self._get_article(aid=posted['id'])
        self.assertEqual(fetched['properties']['foo'], 'bar')
Example #5
0
    def hash_articles(cls, articleset: ArticleSet,
                      ignore_fields: set) -> Iterable[Tuple[int, str]]:
        """
        Finds all articles in an articleset, and hashes articles as a tuple of field values, ordered alphabetically
        by field name. Fields in ignore_fields will not affect the hash.
        Hashes for two articles are equal, if and only if for each field that is not in ignore_fields, the
        values of those fields are equal in both articles.

        When ignore_fields is empty, the "hash" field already stored in the index is returned as-is
        instead of computing a new one.

        @param articleset       The articleset that is to be searched
        @param ignore_fields    A set of fields that should not be included in the calculated hashes

        @return                 An iterable of (<article_id>, <hash>) tuples.
        """
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())

        if ignore_fields:
            fields = sorted(f for f in all_fields if f not in ignore_fields)
        else:
            fields = ["hash"]

        # Restrict the scan to documents that are a member of this set
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "sets": articleset.id
                        }
                    }
                }
            }
        }

        # Scan once; a second identical scan whose result was never read has been dropped.
        for hit in amcates.ES().scan(query=query, fields=fields):
            if not ignore_fields:
                yield int(hit['_id']), hit['fields']['hash'][0]
                continue
            # Missing fields count as None; each field value is a list, of which the first entry is used
            art_tuple = tuple(
                str(hit['fields'].get(k, [None])[0]) for k in fields)
            digest = hash_class(repr(art_tuple).encode()).hexdigest()
            yield int(hit['_id']), digest
Example #6
0
    def handle(self, *args, **options):
        """
        Re-send the database values of every indexed article to the index.

        Article ids are read from the index in groups of GROUP_SIZE. For each group the
        articles are loaded from the database, converted with get_article_dict, stripped
        of their "sets" and "hash" keys and passed to bulk_update_values. Progress and
        throughput are printed after every group.
        """
        es = amcates.ES()

        print("Counting articles..", end=" ")
        sys.stdout.flush()
        narticles = es.count(query="*", filters={})
        print(narticles)

        then, now = datetime.datetime.now(), datetime.datetime.now()
        for i, article_ids in enumerate(grouper(es.query_ids(), n=GROUP_SIZE)):
            progress = (float(i * GROUP_SIZE) / float(narticles)) * 100
            print("{} of {} ({:.2f}%)".format(i * GROUP_SIZE, narticles,
                                              progress))

            articles = Article.objects.filter(
                id__in=article_ids).select_related("medium")
            # Materialise the dicts: they are walked twice below, and on Python 3
            # map() is a one-shot iterator that would be empty the second time,
            # so nothing would be sent to the index.
            article_dicts = [get_article_dict(a) for a in articles]

            # These two keys are not part of this update
            for article_dict in article_dicts:
                del article_dict["sets"]
                del article_dict["hash"]

            es.bulk_update_values({a["id"]: a for a in article_dicts})

            then, now = now, datetime.datetime.now()
            print("Articles per second: ", end="")
            print(int(GROUP_SIZE / (now - then).total_seconds()))

        print("Done.")
Example #7
0
    def cache_results(self):
        """
        Cache the favourite article sets and the per-set article counts.

        The project is read from the 'project_for_favourites' GET parameter of the
        request in self.context. Without it, fav_articlesets and nn are set to None.
        """
        # Only cache once! (is this a hack?) -- later calls hit this no-op instead
        self.cache_results = lambda: None

        try:
            # HACK!
            project = self.context['request'].GET['project_for_favourites']
        except KeyError:
            # no project given, so nothing to do :-(
            self.fav_articlesets = self.nn = None
            return

        favourite_ids = ArticleSet.objects.filter(
            favourite_of_projects=project).values_list("id", flat=True)
        self.fav_articlesets = set(favourite_ids)

        # sets owned by the project plus sets related through projects_set
        in_project = Q(project=project) | Q(projects_set=project)
        sets = list(
            ArticleSet.objects.filter(in_project).values_list("id", flat=True))
        counts = amcates.ES().aggregate_query(filters={'sets': sets},
                                              group_by='sets')
        self.nn = dict(counts)

        self._cached = True
Example #8
0
    def handle(self, *args, **options):
        """
        Recompute the "hash" value of every indexed article.

        Ids are read from the index in groups of GROUP_SIZE; for each group the
        HASH_FIELDS are fetched, hashed with _get_hash and written back in bulk.
        Progress and throughput are printed after every group.
        """
        es = amcates.ES()

        print("Counting articles..", end=" ")
        sys.stdout.flush()
        narticles = es.count(query="*", filters={})
        print(narticles)

        previous, current = datetime.datetime.now(), datetime.datetime.now()
        id_groups = grouper(es.query_ids(), n=GROUP_SIZE)
        for group_nr, article_ids in enumerate(id_groups):
            handled = group_nr * GROUP_SIZE
            progress = (float(handled) / float(narticles)) * 100
            print("{} of {} ({:.2f}%)".format(handled, narticles, progress))

            new_hashes = {}
            for hit in es.query_all(filters={"ids": article_ids},
                                    fields=HASH_FIELDS):
                new_hashes[hit.id] = {"hash": _get_hash(hit.to_dict())}
            es.bulk_update_values(new_hashes)

            previous, current = current, datetime.datetime.now()
            elapsed = current - previous
            print("Articles per second: ", end="")
            print(int(GROUP_SIZE / elapsed.total_seconds()))

        print("Done.")
Example #9
0
    def test_post(self):
        """Test whether posting and retrieving an article works correctly"""
        self.set_up()

        project = amcattest.create_test_project(owner=self.user)
        aset = amcattest.create_test_set(project=project)
        article = {
            'date': datetime.datetime.now().isoformat(),
            'headline': 'Test child',
            'medium': 'Fantasy',
            'text': 'Hello Universe',
            'pagenr': 1,
            'url': 'http://example.org',
            'uuid': 'c691fadf-3c45-4ed6-93fe-f035b5f500af',
        }

        url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(
            p=project, s=aset)
        self.post(url, article, self.user)
        amcates.ES().flush()

        results = self.get(url)["results"]
        self.assertEqual(len(results), 1)
        retrieved = results[0]
        self.assertEqual(retrieved["headline"], article['headline'])
        # compare parsed dates so the textual format does not matter
        self.assertEqual(toolkit.readDate(retrieved["date"]),
                         toolkit.readDate(article['date']))
        self.assertEqual(retrieved["uuid"], article['uuid'])
Example #10
0
    def remove_articles(self,
                        articles,
                        remove_from_index=True,
                        monitor=NullMonitor()):
        """
        Remove articles from this articleset. The matching CodedArticles (from codingjobs
        on this set) are deleted as well, and the index is updated if `remove_from_index` is True.

        @param articles: articles to be removed
        @type articles: iterable with indexing of integers or Article objects

        @param remove_from_index: notify elasticsearch of changes
        @type remove_from_index: bool

        @param monitor: progress monitor; a submonitor of 4 steps is taken from it
        """
        monitor = monitor.submonitor(4)

        # Normalise to plain ids for the index call
        to_remove = set()
        for art in articles:
            to_remove.add(art if type(art) is int else art.id)

        monitor.update(message="Deleting articles from database")
        memberships = ArticleSetArticle.objects.filter(articleset=self,
                                                       article__in=articles)
        memberships.delete()

        monitor.update(message="Deleting coded articles from database")
        coded = CodedArticle.objects.filter(codingjob__articleset=self,
                                            article__in=articles)
        coded.delete()

        if not remove_from_index:
            # keep the step count the same when the index is skipped
            monitor.update()
        else:
            monitor.update(message="Deleting from index")
            amcates.ES().remove_from_set(self.id, to_remove)

        monitor.update(message="Deleting from cache")
        self._reset_property_cache()
Example #11
0
 def test_dates(self):
     """Test whether date deserialization works, see #66"""
     # date only, minute precision, fractional seconds, and a datetime object
     samples = ('2001-01-01', '1992-12-31T23:59', '2012-02-29T12:34:56.789', datetime.datetime.now())
     for sample in samples:
         article = amcattest.create_test_article(date=sample)
         amcates.ES().flush()
         response = self.get("/api/v4/search", ids=article.id)
         returned = response['results'][0]['date']
         self.assertEqual(toolkit.readDate(returned), toolkit.readDate(str(sample)))
Example #12
0
    def get_hashes(self):
        """
        Yield an (article_id, hash) pair for every article in the 'articleset' option.

        Fields for which a truthy "skip_<field>" option is set are left out of the hash.
        When no field is skipped, the "hash" value stored in the index is used directly;
        otherwise a hash is computed over a JSON dump of the remaining field values.
        """
        fields = [
            f for f in FIELDS if not self.options.get("skip_{}".format(f))
        ]
        if fields == FIELDS:
            fields = ["hash"]
        use_stored_hash = fields == ["hash"]
        setid = self.options['articleset'].id

        # Restrict the scan to documents that are a member of the set
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "sets": setid
                        }
                    }
                }
            }
        }

        def first_value(flds, f):
            # field values are lists; a missing field stays None
            val = flds.get(f)
            return val[0] if val is not None else val

        for hit in amcates.ES().scan(query=query, fields=fields):
            if use_stored_hash:
                digest = hit['fields']['hash'][0]
            else:
                d = {f: first_value(hit['fields'], f) for f in fields}
                # Encode before hashing: hashlib-style hashers reject str on Python 3
                # (assumes hash_class is such a constructor -- TODO confirm)
                digest = hash_class(json.dumps(d).encode("utf-8")).hexdigest()
            yield int(hit['_id']), digest
Example #13
0
    def test_parents(self):
        """Test parents via nesting"""

        aset = amcattest.create_test_set()
        url = ArticleViewSet.get_url(project=aset.project.id,
                                     articleset=aset.id)
        base = {
            'text': 'bla',
            'headline': 'headline',
            'date': '2013-01-01T00:00:00',
            'medium': 'test_medium'
        }

        # children are nested in the parent as a JSON-encoded list
        children = [dict(base, headline=name) for name in ('c1', 'c2')]
        body = dict(base, headline='parent', children=json.dumps(children))
        self.post(url, body, as_user=aset.project.owner)
        amcates.ES().flush()

        # result should have 3 articles, with c1 and c2 .parent set to parent
        by_headline = {}
        for article in self.get(url)['results']:
            by_headline[article['headline']] = article
        self.assertEqual(len(by_headline), 3)
        for name in ('c1', 'c2'):
            self.assertEqual(by_headline[name]['parent'],
                             by_headline['parent']['id'])
        self.assertEqual(by_headline['parent']['parent'], None)
Example #14
0
    def test_children(self):
        """Post a parent with one nested child and check the child points at the parent."""
        project = amcattest.create_test_project()
        aset = amcattest.create_test_set(project=project)
        child = {
            'date': '2001-01-02',
            'headline': 'Test child',
            'medium': 'Fantasy',
            'text': 'Hello Universe'
        }
        parent = {
            'date': '2001-01-01',
            'headline': 'Test parent',
            'medium': 'My Imagination',
            'text': 'Hello World',
            # need to json dump the children because the django client does weird stuff with post data
            'children': json.dumps([child])
        }
        url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(
            p=project, s=aset)
        self.post(url, parent, as_user=self.user)
        amcates.ES().flush()

        results = self.get(url)["results"]

        by_headline = {entry['headline']: entry for entry in results}
        self.assertEqual(set(by_headline), {'Test parent', 'Test child'})
        self.assertEqual(by_headline['Test child']['parent'],
                         by_headline['Test parent']['id'])
Example #15
0
    def test_parents_multiple(self):
        """Can we add multiple objects with children?"""
        aset = amcattest.create_test_set()
        url = ArticleViewSet.get_url(project=aset.project.id,
                                     articleset=aset.id)
        base = {
            'text': 'bla',
            'headline': 'headline',
            'date': '2013-01-01T00:00:00',
            'medium': 'test_medium'
        }

        # one stand-alone article plus one parent carrying a single child
        leaf = dict(base, headline='l')
        parent_with_child = dict(base,
                                 headline='p',
                                 children=[dict(base, headline='c')])

        body = json.dumps([leaf, parent_with_child])
        json_request = dict(content_type='application/json')
        self.post(url,
                  body,
                  as_user=aset.project.owner,
                  request_options=json_request)
        amcates.ES().flush()

        by_headline = {}
        for article in self.get(url)['results']:
            by_headline[article['headline']] = article
        self.assertEqual(len(by_headline), 3)
        self.assertEqual(by_headline['c']['parent'], by_headline['p']['id'])
        self.assertEqual(by_headline['p']['parent'], None)
        self.assertEqual(by_headline['l']['parent'], None)
Example #16
0
    def _post_articles(self,
                       data,
                       expected_status=201,
                       as_user="******",
                       return_json=None,
                       **url_kwargs):
        """
        POST `data` as JSON to the url given by self.url_set and check the status code.

        @param data: JSON-serialisable payload
        @param expected_status: status code the response must have
        @param as_user: user to log in as; the default (or the string "self.user") means
                        self.user, a falsy value means posting while logged out
        @param return_json: whether to decode the response body as JSON; defaults to
                            True exactly when expected_status is a 2xx code
        @param url_kwargs: passed on to self.url_set

        @return: the decoded JSON body if return_json is true, otherwise the response object
        """
        # The signature default and the "self.user" marker both stand for self.user.
        # Previously only "self.user" was recognised, so a default call went on to
        # read .username from a plain string.
        if as_user in ("self.user", "******"):
            as_user = self.user
        if return_json is None:
            return_json = (expected_status // 100) == 2
        if as_user:
            self.client.login(username=as_user.username, password="******")
        else:
            self.client.logout()
        url = self.url_set(**url_kwargs)
        response = self.client.post(url,
                                    content_type="application/json",
                                    data=json.dumps(data))
        self.assertEqual(
            response.status_code, expected_status,
            "Status code {response.status_code}: {response.content}".format(
                response=response))

        # refresh the index before the caller inspects it
        amcates.ES().refresh()
        if return_json:
            return json.loads(response.content.decode(response.charset))
        return response
Example #17
0
 def test_dupe(self):
     """Test whether deduplication works"""
     article = test_article(title='testartikel')
     first_id = self._post_articles(article)['id']
     # post the same article again after setUp_set()
     self.setUp_set()
     second_id = self._post_articles(article)['id']
     amcates.ES().refresh()
     # are the resulting ids identical?
     self.assertEqual(first_id, second_id)
     # is it added to elastic for this set?
     ids_in_set = set(amcates.ES().query_ids(filters={'sets': self.aset.id}))
     self.assertEqual(ids_in_set, {first_id})
     # is it not added (ie we only have one article with this title)
     ids_with_title = set(amcates.ES().query_ids(filters={'title': article['title']}))
     self.assertEqual(ids_with_title, {first_id})
Example #18
0
 def get_nn(self):
     """Return a dict built from the index aggregation, grouped by set, for the view's set(s)."""
     view = self.context["view"]
     if not hasattr(view, 'object_list'):
         # view exposes a single object rather than an object_list
         sets = [view.object.id]
     else:
         sets = list(view.object_list.values_list("id", flat=True))
     per_set = amcates.ES().aggregate_query(filters={'sets': sets},
                                            group_by='sets')
     return dict(per_set)
Example #19
0
    def delete(self, purge_orphans=True):
        "Delete the articleset and all articles from index and db"
        # which articles are only in this set?
        # check per N articles
        all_ids = self.articles.values_list("pk", flat=True)
        for batch in toolkit.splitlist(all_ids):
            memberships_elsewhere = ArticleSetArticle.objects.filter(
                article_id__in=batch).exclude(articleset=self)
            in_other_sets = set(
                memberships_elsewhere.values_list("article_id", flat=True))
            # articles that no other set refers to are deleted from the db
            only_here = set(batch).difference(in_other_sets)
            Article.objects.filter(pk__in=only_here).delete()
            amcates.ES().remove_from_set(self.id, batch)

        if purge_orphans:
            amcates.ES().purge_orphans()

        # cascade deletes all article references
        super(ArticleSet, self).delete()
Example #20
0
def initialize(sender, **kwargs):
    """
    Initialize the amcat database: load the initial data fixture, create the admin
    account, and check the elasticsearch index.
    """
    # the fixture lives next to the amcat.models package
    models_dir = os.path.dirname(amcat.models.__file__)
    datafile = os.path.join(models_dir, "_initial_data.json")
    loaddata_argv = ["manage", "loaddata", datafile]
    Command().run_from_argv(loaddata_argv)
    create_admin()
    amcates.ES().check_index()
Example #21
0
 def remove_articles(self, articles, remove_from_index=True):
     """
     Remove the given articles from this article set.
     The ArticleSetArticle rows linking them to this set are deleted; if
     remove_from_index is True the index is asked to drop them from the set as well.
     """
     memberships = ArticleSetArticle.objects.filter(articleset=self, article__in=articles)
     memberships.delete()

     if not remove_from_index:
         return

     # the index call wants plain ids; entries may be ints or Article objects
     to_remove = set()
     for art in articles:
         to_remove.add(art if type(art) is int else art.id)
     amcates.ES().remove_from_set(self.id, to_remove)
Example #22
0
def get_adhoc_result(analysis, text, store_intermediate=True):
    """
    Run an analysis pipeline on a piece of text that is not a stored article.

    @param analysis: analysis specification, resolved through _get_analysis
    @param text: the text to analyse
    @param store_intermediate: passed on to the xtas pipeline call

    @return: whatever the xtas pipeline call returns
    """
    from xtas.tasks.es import adhoc_document
    from xtas.tasks.pipeline import pipeline

    analysis = _get_analysis(analysis)
    es = amcates.ES()
    doc = adhoc_document('adhoc', es.doc_type, 'text', text=text)

    # Call form of print: same output on Python 2, and valid syntax on Python 3
    print("Pipelining...")
    return pipeline(doc, analysis, store_intermediate=store_intermediate)
Example #23
0
    def test_deduplication(self):
        """Does deduplication work as it is supposed to?"""

        # create dummy articles to have something in the db
        for _ in range(10):
            amcattest.create_test_article()
        amcates.ES().refresh()

        props = {
            'project': amcattest.create_test_project(),
            'title': "deduptest",
            'text': "test",
            'date': '2001-01-01',
        }

        original = amcattest.create_test_article(**props)
        amcates.ES().refresh()
        self.assertEqual(_q(title='deduptest'), {original.id})

        # duplicate articles should not be added
        duplicate = amcattest.create_test_article(**props)
        amcates.ES().refresh()
        self.assertEqual(duplicate.id, original.id)
        self.assertTrue(duplicate._duplicate)
        self.assertEqual(_q(title='deduptest'), {original.id})

        # however, if an articleset is given the 'existing' article
        # should be added to that set
        aset = amcattest.create_test_set()
        in_set = amcattest.create_test_article(articleset=aset, **props)
        amcates.ES().refresh()
        self.assertEqual(in_set.id, original.id)
        self.assertEqual(_q(title='deduptest'), {original.id})
        self.assertEqual(set(aset.get_article_ids()), {original.id})
        self.assertEqual(_q(sets=aset.id), {original.id})

        # if an existing hash is set, it should be correct
        with_bad_hash = dict(hash=b'hash', **props)
        self.assertRaises(ValueError, amcattest.create_test_article,
                          **with_bad_hash)

        #TODO! Check duplicates within new articles
        props['title'] = "internaldupe"
        twin_a, twin_b = Article(**props), Article(**props)
        Article.create_articles([twin_a, twin_b], articleset=aset)
        self.assertEqual(twin_a.id, twin_b.id)
        self.assertEqual(len(_q(title='internaldupe')), 1)
Example #24
0
    def delete(self, purge_orphans=True):
        "Delete the articleset and all articles from index and db"
        # which articles are only in this set?
        # check per N articles

        log.warn("Getting all articles")

        all_ids = list(self.articles.values_list("pk", flat=True))
        orphans = set(all_ids)
        log.warn("Finding orphans in {} articles".format(len(all_ids)))
        for chunk in toolkit.splitlist(all_ids, itemsperbatch=1000):
            memberships_elsewhere = ArticleSetArticle.objects.filter(
                article_id__in=chunk).exclude(articleset=self)
            # anything that also lives in another set is not an orphan
            orphans.difference_update(
                set(memberships_elsewhere.values_list("article_id",
                                                      flat=True)))

        log.warn("Removing {} orphans from DB".format(len(orphans)))
        orphan_chunks = toolkit.splitlist(orphans, itemsperbatch=10000)
        for i, chunk in enumerate(orphan_chunks):
            if i > 1:
                log.warn("... batch {i} (x10k)".format(i=i))
            Article.objects.filter(pk__in=chunk).only("pk").delete()

        log.warn("Getting set membership from elastic")
        indexed_ids = list(self.get_article_ids_from_elastic())
        if indexed_ids:
            message = "Removing set membership from elastic ({} articles)"
            log.warn(message.format(len(indexed_ids)))
            amcates.ES().remove_from_set(self.id, indexed_ids)

        if purge_orphans:
            # refresh first, then purge
            amcates.ES().refresh()
            amcates.ES().purge_orphans()

        log.warn("Deleting set (and articlesetarticle references)")
        # cascade deletes all article references
        super(ArticleSet, self).delete()
        log.warn("Done!")
Example #25
0
 def __init__(self,
              user=None,
              queries=None,
              filters=None,
              fields=None,
              hits=False):
     """
     Store the search parameters and open an ES handle.

     filters falls back to an empty dict when falsy; "id" is dropped from
     the requested fields.
     """
     self.user = user
     self.queries = queries
     self.filters = filters if filters else {}
     requested = fields if fields else []
     self.fields = [name for name in requested if name != "id"]
     self.es = amcates.ES()
     self.hits = hits
     # no count yet
     self._count = None
Example #26
0
    def test_post_id(self):
        """Existing articles can be added to the set by posting their ids."""

        def ids_in_set():
            # looks up self.aset on every call, so it follows setUp_set()
            return set(
                amcates.ES().query_ids(filters={"sets": self.aset.id}))

        first = amcattest.create_test_article()
        self._post_articles({"id": first.id})
        self.assertEqual(ids_in_set(), {first.id})

        second = amcattest.create_test_article()
        self._post_articles([{"id": first.id}, {"id": second.id}])
        self.assertEqual(ids_in_set(), {first.id, second.id})

        # does it also work if we just post the ids?
        self.setUp_set()
        self._post_articles(first.id)
        self.assertEqual(ids_in_set(), {first.id})
        self._post_articles([first.id, second.id])
        self.assertEqual(ids_in_set(), {first.id, second.id})
Example #27
0
    def inner(*args, **kargs):
        """Recreate the ES index, then call the wrapped function; skip if ES does not answer a ping."""
        from amcat.tools import amcates

        # clear the module-level _KNOWN_PROPERTIES value before the test runs
        amcates._KNOWN_PROPERTIES = None

        es = amcates.ES()
        reachable = es.es.ping()
        if not reachable:
            raise unittest.SkipTest("ES not enabled")

        # drop the index and set it up again, refreshing after each step
        es.delete_index()
        es.refresh()
        es.check_index()
        es.refresh()

        outcome = func(*args, **kargs)
        return outcome
Example #28
0
def get_result(article, analysis, store_intermediate=True, block=True):
    """
    Run an analysis pipeline on the text field of an indexed article.

    `article` may be an integer id or an object with an `id` attribute.
    `block` is accepted but not used in this function.
    Returns whatever the xtas pipeline call returns.
    """
    from xtas.tasks.pipeline import pipeline

    article_id = article if isinstance(article, int) else article.id
    analysis = _get_analysis(analysis)

    es = amcates.ES()
    # describes the document (index, type, id) and the field to analyse
    doc = dict(index=es.index, type=es.doc_type, id=article_id, field='text')
    return pipeline(doc, analysis, store_intermediate=store_intermediate)
Example #29
0
    def to_representation(self, aid):
        """Yield one dict per token of the article, title tokens first, each field by offset."""
        fields = ["title", "text"]

        def field_then_offset(token):
            # order by position of the field in `fields`, then by offset
            name, offset, _term = token
            return fields.index(name), offset

        tokens = amcates.ES().get_tokens(aid, fields)
        ordered = sorted(tokens, key=field_then_offset)
        for name, position, term in ordered:
            yield {
                "id": aid,
                "field": name,
                "position": position,
                "word": term
            }
Example #30
0
    def add_articles(self,
                     articles,
                     add_to_index=True,
                     monitor=None):
        """
        Add the given articles to this articleset. The implementation consists of three parts:

          1. Adding ArticleSetArticle objects
          2. Adding CodedArticle objects
          3. Updating index

        Articles that are already in the set, or that do not exist, are skipped.

        @param articles: articles to be added
        @type articles: iterable with indexing of integers or Article objects

        @param add_to_index: notify elasticsearch of changes
        @type add_to_index: bool

        @param monitor: progress monitor to report to; a new ProgressMonitor is
                        created when none is given
        """
        # Create the default per call: a default instance in the signature would be
        # built once and shared by every call that omits the argument.
        if monitor is None:
            monitor = ProgressMonitor()

        articles = {(art if type(art) is int else art.id) for art in articles}
        to_add = articles - self.get_article_ids()
        # Only use articles that exist
        to_add = list(Article.exists(to_add))

        monitor.update(10,
                       "{n} articles need to be added".format(n=len(to_add)))
        ArticleSetArticle.objects.bulk_create(
            [
                ArticleSetArticle(articleset=self, article_id=artid)
                for artid in to_add
            ],
            batch_size=100,
        )

        monitor.update(
            20,
            "{n} articleset articles added to database, adding to codingjobs".
            format(n=len(to_add)))
        # every codingjob on this set gets a CodedArticle for every new article
        cjarts = [
            CodedArticle(codingjob=c, article_id=a)
            for c, a in itertools.product(self.codingjob_set.all(), to_add)
        ]
        CodedArticle.objects.bulk_create(cjarts)

        monitor.update(
            30, "{n} articles added to codingjobs, adding to index".format(
                n=len(cjarts)))
        if add_to_index:
            amcates.ES().add_to_set(self.id, to_add, monitor=monitor)