Example #1
    def _do_create_articles(cls, articles, deduplicate=True):
        """Check duplicates and save the articles to db.
        Does *not* save to elastic or add to articlesets
        Assumes that if .parent is given, it has an id
        (because parent is part of hash)
        modifies all articles in place with .hash, .id, .uuid, and .duplicate (None or Article)
        """
        es = amcates.ES()

        uuids = {} # {uuid : article}
        hashes = collections.defaultdict(list) # {hash: articles}
        
        # Iterate over articles, mark duplicates within addendum and build uuids/hashes dictionaries
        for a in articles:
            if a.id:
                raise ValueError("Specifying explicit article ID in save not allowed")
            a.es_dict = amcates.get_article_dict(a)
            a.hash = a.es_dict['hash']
            if not hasattr(a, 'uuid'): a.uuid = None
            a.duplicate, a.internal_duplicate = None, None # innocent until proven guilty
            if not deduplicate:
                continue
            if a.uuid:
                uuid = str(a.uuid)
                if uuid in uuids:
                    raise ValueError("Duplicate UUID in article upload")
                uuids[uuid] = a
            else: # articles with explicit uuid cannot be deduplicated on hash
                hashes[a.hash].append(a)
            
        def _set_dupe(dupe, orig):
            dupe.duplicate = orig
            dupe.id = orig.id
            dupe.uuid = orig.uuid
        # check dupes based on hash
        if hashes:
            results = es.query_all(filters={'hash': hashes.keys()},
                                   fields=["hash", "uuid"], score=False)
            for orig in results:
                for dupe in hashes[orig.hash]:
                    _set_dupe(dupe, orig)

        # check dupes based on uuid (if any are given)
        if uuids:
            results = es.query_all(filters={'uuid': uuids.keys()},
                                   fields=["hash", "uuid"], score=False)
            for orig in results:
                dupe = uuids[orig.uuid]
                if dupe.hash != orig.hash:
                    raise ValueError("Cannot modify existing articles: {orig.hash} != {dupe.hash}".format(**locals()))
                _set_dupe(dupe, orig)

        # now we can save the articles and set id
        to_insert = [a for a in articles if not a.duplicate]
        if not to_insert:
            return []
        result = bulk_insert_returning_ids(to_insert)
        for a, inserted in zip(to_insert, result):
            a.id = inserted.id
        return to_insert
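
Every call site in these examples relies on the same contract: bulk_insert_returning_ids takes an iterable of unsaved model instances, inserts them in one pass, and returns instances that carry the database-assigned ids in input order, without mutating the originals (see the tests in Examples #5 and #6). The helper itself is not shown in this collection; purely as an illustration, a minimal sketch of such a helper on PostgreSQL with Django might look as follows. The name bulk_insert_returning_ids_sketch, the per-row execute, and the ignored fields argument are assumptions, not the real implementation:

import copy

from django.db import connection


def bulk_insert_returning_ids_sketch(objects, fields=None):
    """Hypothetical sketch only: insert `objects` and return copies with ids set.

    The `fields` argument is accepted for signature compatibility but ignored here.
    """
    objects = list(objects)
    if not objects:
        return []
    meta = type(objects[0])._meta
    cols = [f for f in meta.concrete_fields if not f.primary_key]
    sql = "INSERT INTO {table} ({columns}) VALUES ({params}) RETURNING {pk}".format(
        table=meta.db_table,
        columns=", ".join(f.column for f in cols),
        params=", ".join(["%s"] * len(cols)),
        pk=meta.pk.column)
    results = []
    with connection.cursor() as cursor:
        for obj in objects:
            # one statement per row keeps the sketch simple; a real helper would
            # presumably batch the insert
            row = [f.get_prep_value(getattr(obj, f.attname)) for f in cols]
            cursor.execute(sql, row)
            new_id = cursor.fetchone()[0]
            clone = copy.copy(obj)   # return a copy so the caller's instance stays unsaved
            clone.pk = new_id
            results.append(clone)
    return results
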
Example #2
    def import_codingjobs(self):
        old_ids, jobs = [], []
        for job in self._get_dicts("codingjobs.jsonl"):
            j = CodingJob(project_id=self.status.project.id, name=job['name'], archived=job['archived'],
                          insertuser_id=self.status.project.owner.id, coder_id=self.status.users[job['coder']],
                          articleset_id=self.status.setids[job['articleset']])
            if job['articleschema']:
                j.articleschema_id = self.status.codingschemas[job['articleschema']]
            if job['unitschema']:
                j.unitschema_id = self.status.codingschemas[job['unitschema']]
            jobs.append(j)
            old_ids.append(job['pk'])
        jobs = bulk_insert_returning_ids(jobs)
        return {old_id: job.id for (old_id, job) in zip(old_ids, jobs)}
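
import_codingjobs returns a mapping from the primary keys in the export file to the ids assigned by bulk_insert_returning_ids, which later import steps can presumably use to remap references. A minimal, hypothetical illustration of consuming that map; the "codings.jsonl" file name and the "codingjob" key are made up for the example:

        codingjob_ids = self.import_codingjobs()         # {old_pk: new_id}
        for coding in self._get_dicts("codings.jsonl"):  # hypothetical file and key names
            new_job_id = codingjob_ids[coding["codingjob"]]
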
Example #3
    def _replace_codings(self, new_codings):
        # Updating tactic: delete all existing codings and codingvalues, then insert
        # the new ones. This prevents calculating a delta, and confronting the
        # database with (potentially) many update queries.
        CodingValue.objects.filter(coding__coded_article=self).delete()
        Coding.objects.filter(coded_article=self).delete()

        new_coding_objects = map(partial(_to_coding, self), new_codings)
        new_coding_objects = bulk_insert_returning_ids(new_coding_objects)

        coding_values = itertools.chain.from_iterable(
            _to_codingvalues(co, c["values"]) for c, co in zip(new_codings, new_coding_objects)
        )

        return (new_coding_objects, CodingValue.objects.bulk_create(coding_values))
Example #4
    def _replace_codings(self, new_codings):
        # Updating tactic: delete all existing codings and codingvalues, then insert
        # the new ones. This prevents calculating a delta, and confronting the
        # database with (potentially) many update queries.
        CodingValue.objects.filter(coding__coded_article=self).delete()
        Coding.objects.filter(coded_article=self).delete()

        new_coding_objects = list(map(partial(_to_coding, self), new_codings))
        new_coding_objects = bulk_insert_returning_ids(new_coding_objects)

        coding_values = list(itertools.chain.from_iterable(
            _to_codingvalues(co, c["values"]) for c, co in zip(new_codings, new_coding_objects)
        ))

        return (new_coding_objects, CodingValue.objects.bulk_create(coding_values))
Example #5
    def test_bulk_insert_returning_ids(self):
        m1 = Medium(name="test_bi_1")
        m2 = Medium(name="test_bi_2")

        self.assertIsNone(m1.id)
        self.assertIsNone(m2.id)

        new_objects = bulk_insert_returning_ids([m1, m2])

        self.assertIsNone(m1.id)
        self.assertIsNone(m2.id)
        self.assertIsNotNone(new_objects[0].id)
        self.assertIsNotNone(new_objects[1].id)

        self.assertEqual("test_bi_1", Medium.objects.get(id=new_objects[0].id).name)
        self.assertEqual("test_bi_2", Medium.objects.get(id=new_objects[1].id).name)
Example #6
    def test_bulk_insert_returning_ids(self):
        m1 = Language(label="test_bi_1")
        m2 = Language(label="test_bi_2")

        self.assertIsNone(m1.id)
        self.assertIsNone(m2.id)

        new_objects = bulk_insert_returning_ids([m1, m2])

        self.assertIsNone(m1.id)
        self.assertIsNone(m2.id)
        self.assertIsNotNone(new_objects[0].id)
        self.assertIsNotNone(new_objects[1].id)

        self.assertEqual("test_bi_1", Language.objects.get(id=new_objects[0].id).label)
        self.assertEqual("test_bi_2", Language.objects.get(id=new_objects[1].id).label)
Example #8
    def import_sentences(self):
        sentences = {}  # aid -> {(parnr, sentnr): sentence_id}
        for i, batch in enumerate(toolkit.splitlist(self._get_dicts("sentences.jsonl"), itemsperbatch=1000)):
            logging.info("Creating sentences for 1000 articles, batch {i}".format(**locals()))
            # check which sentences already exist for these articles
            articles = {self.status.articles[d["article_id"]]: d["sentences"] for d in batch}
            _load_sentences(Sentence.objects.filter(article_id__in=articles)
                            .values_list("article_id", "parnr", "sentnr", "pk"),
                            target=sentences)
            to_add = list(self.get_sentences(articles, sentences))
            if to_add:
                logging.info("Creating {} sentences".format(len(to_add)))
                added = ((s.article_id, s.parnr, s.sentnr, s.pk)
                         for s in bulk_insert_returning_ids(to_add, fields=["*"]))
                _load_sentences(added, target=sentences)
        return sentences
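
_load_sentences is not included in this collection; judging from its call sites above (it receives (article_id, parnr, sentnr, pk) tuples and fills the aid -> {(parnr, sentnr): sentence_id} mapping), a plausible sketch would be:

def _load_sentences(rows, target):
    # Assumed shape, inferred from the calls above; not the actual implementation.
    for article_id, parnr, sentnr, pk in rows:
        target.setdefault(article_id, {})[(parnr, sentnr)] = pk
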
Example #9
    def test_bulk_insert_returning_ids(self):
        m1 = Medium(name="test_bi_1")
        m2 = Medium(name="test_bi_2")

        self.assertIsNone(m1.id)
        self.assertIsNone(m2.id)

        new_objects = bulk_insert_returning_ids([m1, m2])

        self.assertIsNone(m1.id)
        self.assertIsNone(m2.id)
        self.assertIsNotNone(new_objects[0].id)
        self.assertIsNotNone(new_objects[1].id)

        self.assertEqual("test_bi_1",
                         Medium.objects.get(id=new_objects[0].id).name)
        self.assertEqual("test_bi_2",
                         Medium.objects.get(id=new_objects[1].id).name)
Example #10
    def save_trees(cls, trees):
        """
        Saves a list of article trees efficiently to database.

        @type trees: [ArticleTree]
        """
        #trees = map(copy, trees)

        for level in count():
            level_trees = list(chain.from_iterable(tree.get_level(level) for tree in trees))

            if not level_trees:
                break

            for tree in level_trees:
                if tree.parent is None:
                    continue
                tree.obj.parent = tree.parent.obj

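            # bulk_insert_returning_ids returns fresh instances with ids set (the inputs
            # are not mutated), so rebind tree.obj to carry the id into the next level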
            articles = bulk_insert_returning_ids(t.obj for t in level_trees)
            for tree, article in zip(level_trees, articles):
                tree.obj = article
Example #11
    def save_trees(cls, trees):
        """
        Saves a list of article trees efficiently to database.

        @type trees: [ArticleTree]
        """
        #trees = map(copy, trees)

        for level in count():
            level_trees = list(
                chain.from_iterable(tree.get_level(level) for tree in trees))

            if not level_trees:
                break

            for tree in level_trees:
                if tree.parent is None:
                    continue
                tree.obj.parent = tree.parent.obj

            articles = bulk_insert_returning_ids(t.obj for t in level_trees)
            for tree, article in zip(level_trees, articles):
                tree.obj = article
Example #12
    def create_articles(cls,
                        articles,
                        articleset=None,
                        articlesets=None,
                        deduplicate=True,
                        monitor=NullMonitor()):
        """
        Add the given articles to the database, the index, and the given set

        Duplicates are detected and have ._duplicate and .id set (and are added to sets)

        @param articles: a collection of objects with the necessary properties (.title etc)
        @param articleset(s): articleset object(s), specify either or none
        """
        monitor = monitor.submonitor(total=6)
        if articlesets is None:
            articlesets = [articleset] if articleset else []

        # Check for ids
        for a in articles:
            if a.id is not None:
                raise ValueError(
                    "Specifying explicit article ID in save not allowed")

        # Compute hashes, mark all articles as non-duplicates
        for a in articles:
            a.compute_hash()
            a._duplicate = None

        # Determine which articles are dupes of each other, *then* query the database
        # to check if the database has any articles we just got.
        if deduplicate:
            hashes = collections.defaultdict(
                list)  # type: Dict[bytes, List[Article]]

            for a in articles:
                if a.hash in hashes:
                    a._duplicate = hashes[a.hash][0]
                else:
                    hashes[a.hash].append(a)

            # Check database for duplicates
            monitor.update(message="Checking _duplicates based on hash..")
            if hashes:
                results = Article.objects.filter(
                    hash__in=hashes.keys()).only("hash")
                for orig in results:
                    dupes = hashes[orig.hash]
                    for dupe in dupes:
                        dupe._duplicate = orig
                        dupe.id = orig.id
        else:
            monitor.update()

        # Save all non-duplicates
        to_insert = [a for a in articles if not a._duplicate]
        monitor.update(message="Inserting {} articles into database..".format(
            len(to_insert)))
        if to_insert:
            result = bulk_insert_returning_ids(to_insert)
            for a, inserted in zip(to_insert, result):
                a.id = inserted.id
            dicts = [
                a.get_article_dict(sets=[aset.id for aset in articlesets])
                for a in to_insert
            ]
            amcates.ES().bulk_insert(dicts, batch_size=100, monitor=monitor)
        else:
            monitor.update()

        # At this point we can still have internal duplicates. Give them an ID as well.
        for article in articles:
            if article.id is None and article._duplicate is not None:
                article.id = article._duplicate.id

        if not articlesets:
            monitor.update(3)
            return articles

        # add new articles and _duplicates to articlesets
        monitor.update(message="Adding articles to articleset..")
        new_ids = {a.id for a in to_insert}
        dupes = {a._duplicate.id for a in articles if a._duplicate} - new_ids
        for aset in articlesets:
            if new_ids:
                aset.add_articles(new_ids, add_to_index=False, monitor=monitor)
            else:
                monitor.update()

            if dupes:
                aset.add_articles(dupes, add_to_index=True, monitor=monitor)
            else:
                monitor.update()

        # Add to articleset caches
        properties = set()
        for article in articles:
            properties.update(article.properties.keys())

        for articleset in articlesets:
            articleset._add_to_property_cache(properties)

        return articles
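
For completeness, a hedged usage sketch of create_articles as the call sites above suggest; every article property other than title, as well as the articleset lookup, is an illustrative assumption:

from datetime import datetime

articles = [Article(title="Example article {}".format(i),       # only .title is documented above;
                    text="Body text", date=datetime(2015, 1, 1))  # other field names are assumed
            for i in range(10)]
aset = ArticleSet.objects.get(name="My set")                      # illustrative lookup
Article.create_articles(articles, articleset=aset)
# all articles, including detected duplicates, now have .id set and are added to aset
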