def _do_create_articles(cls, articles, deduplicate=True):
    """Check duplicates and save the articles to db.

    Does *not* save to elastic or add to articlesets.
    Assumes that if .parent is given, it has an id (because parent is part of hash).
    Modifies all articles in place with .es_dict, .hash, .id, .uuid, and
    .duplicate (None or Article).

    @param articles: article objects to save; none may have an id yet
    @param deduplicate: if False, skip all duplicate detection and insert everything
    @return: the list of articles that were actually inserted (duplicates excluded)
    @raises ValueError: on an explicit article id, a repeated uuid within the
        upload, or a uuid whose stored hash differs from the new hash
    """
    es = amcates.ES()
    uuids = {}  # {uuid : article}
    hashes = collections.defaultdict(list)  # {hash: articles}
    # Iterate over articles, mark duplicates within addendum and build uuids/hashes dictionaries
    for a in articles:
        if a.id:
            raise ValueError("Specifying explicit article ID in save not allowed")
        a.es_dict = amcates.get_article_dict(a)
        a.hash = a.es_dict['hash']
        if not hasattr(a, 'uuid'):
            a.uuid = None
        # innocent until proven guilty
        # NOTE(review): .internal_duplicate is initialized here but never set or
        # read elsewhere in this function — presumably consumed by a caller; verify.
        a.duplicate, a.internal_duplicate = None, None
        if not deduplicate:
            continue
        if a.uuid:
            # Articles with an explicit uuid are deduplicated on uuid only,
            # never on hash; a repeated uuid within one upload is an error.
            uuid = str(a.uuid)
            if uuid in uuids:
                raise ValueError("Duplicate UUID in article upload")
            uuids[uuid] = a
        else:
            # articles with explicit uuid cannot be deduplicated on hash,
            # so only uuid-less articles are collected per hash here.
            # NOTE(review): two uuid-less articles with the same hash in this
            # upload are only marked duplicates if that hash already exists in
            # elastic — in-batch hash duplicates are not cross-marked; confirm
            # this is intended.
            hashes[a.hash].append(a)

    def _set_dupe(dupe, orig):
        # Mark `dupe` as a duplicate of the existing article `orig`,
        # adopting its id and uuid.
        dupe.duplicate = orig
        dupe.id = orig.id
        dupe.uuid = orig.uuid

    # check dupes based on hash
    if hashes:
        results = es.query_all(filters={'hash': hashes.keys()}, fields=["hash", "uuid"], score=False)
        for orig in results:
            for dupe in hashes[orig.hash]:
                _set_dupe(dupe, orig)
    # check dupes based on uuid (if any are given)
    if uuids:
        results = es.query_all(filters={'uuid': uuids.keys()}, fields=["hash", "uuid"], score=False)
        for orig in results:
            dupe = uuids[orig.uuid]
            # A known uuid with a changed hash means the caller is trying to
            # alter an existing article, which is not allowed.
            if dupe.hash != orig.hash:
                raise ValueError("Cannot modify existing articles: {orig.hash} != {dupe.hash}".format(**locals()))
            _set_dupe(dupe, orig)
    # now we can save the articles and set id
    to_insert = [a for a in articles if not a.duplicate]
    # NOTE(review): bulk insert is invoked before the empty check below —
    # presumably bulk_insert_returning_ids tolerates an empty list; confirm.
    result = bulk_insert_returning_ids(to_insert)
    if len(to_insert) == 0:
        return []
    for a, inserted in zip(to_insert, result):
        a.id = inserted.id
    return to_insert
def import_codingjobs(self):
    """
    Import coding jobs from codingjobs.jsonl into the target project.

    Schema references, users and articlesets are remapped through the
    translation tables on self.status.

    @return: {old codingjob id: new codingjob id}
    """
    mapping_keys = []
    pending = []
    for record in self._get_dicts("codingjobs.jsonl"):
        codingjob = CodingJob(
            project_id=self.status.project.id,
            name=record['name'],
            archived=record['archived'],
            insertuser_id=self.status.project.owner.id,
            coder_id=self.status.users[record['coder']],
            articleset_id=self.status.setids[record['articleset']],
        )
        # Schemas are optional; remap each one only when present.
        for schema_attr in ('articleschema', 'unitschema'):
            if record[schema_attr]:
                setattr(codingjob, schema_attr + '_id',
                        self.status.codingschemas[record[schema_attr]])
        pending.append(codingjob)
        mapping_keys.append(record['pk'])
    inserted = bulk_insert_returning_ids(pending)
    return dict(zip(mapping_keys, (job.id for job in inserted)))
def _replace_codings(self, new_codings):
    """
    Replace this coded article's codings and coding values with *new_codings*.

    Updating tactic: delete all existing codings and codingvalues, then insert
    the new ones. This prevents calculating a delta, and confronting the
    database with (potentially) many update queries.

    @param new_codings: dicts with at least a "values" key per coding
    @return: (inserted Coding objects, created CodingValue objects)
    """
    CodingValue.objects.filter(coding__coded_article=self).delete()
    Coding.objects.filter(coded_article=self).delete()

    # Materialize eagerly: map() is a one-shot iterator in Python 3, so
    # passing it on lazily risks silent partial/empty iteration if the
    # callee walks it more than once. (A sibling copy of this method in
    # this file already applies the same list() fix.)
    new_coding_objects = list(map(partial(_to_coding, self), new_codings))
    new_coding_objects = bulk_insert_returning_ids(new_coding_objects)

    # Same reasoning for the chained generator of coding values.
    coding_values = list(itertools.chain.from_iterable(
        _to_codingvalues(co, c["values"])
        for c, co in zip(new_codings, new_coding_objects)))

    return (new_coding_objects, CodingValue.objects.bulk_create(coding_values))
def _replace_codings(self, new_codings):
    """
    Replace this coded article's codings and coding values with *new_codings*.

    Updating tactic: delete all existing codings and codingvalues, then insert
    the new ones. This prevents calculating a delta, and confronting the
    database with (potentially) many update queries.

    @return: (inserted Coding objects, created CodingValue objects)
    """
    CodingValue.objects.filter(coding__coded_article=self).delete()
    Coding.objects.filter(coded_article=self).delete()

    # Build and insert the replacement codings in one bulk round-trip.
    inserted = bulk_insert_returning_ids(
        list(map(partial(_to_coding, self), new_codings)))

    # Pair each inserted coding with its source dict to expand the values.
    values = list(itertools.chain.from_iterable(
        _to_codingvalues(coding, source["values"])
        for source, coding in zip(new_codings, inserted)))

    return (inserted, CodingValue.objects.bulk_create(values))
def test_bulk_insert_returning_ids(self):
    """bulk_insert_returning_ids returns saved copies; the inputs stay unsaved."""
    unsaved = [Medium(name="test_bi_1"), Medium(name="test_bi_2")]
    for medium in unsaved:
        self.assertIsNone(medium.id)
    saved = bulk_insert_returning_ids(unsaved)
    # The original instances must not have been mutated.
    for medium in unsaved:
        self.assertIsNone(medium.id)
    # The returned copies carry fresh database ids and round-trip correctly.
    for medium, persisted in zip(unsaved, saved):
        self.assertIsNotNone(persisted.id)
        self.assertEqual(medium.name, Medium.objects.get(id=persisted.id).name)
def test_bulk_insert_returning_ids(self):
    """The returned copies get database ids; the originals keep id=None."""
    originals = [Language(label="test_bi_1"), Language(label="test_bi_2")]
    for lang in originals:
        self.assertIsNone(lang.id)
    copies = bulk_insert_returning_ids(originals)
    # Passed-in instances are left untouched.
    for lang in originals:
        self.assertIsNone(lang.id)
    # Returned copies are persisted with the expected labels.
    for expected_label, copy in zip(["test_bi_1", "test_bi_2"], copies):
        self.assertIsNotNone(copy.id)
        self.assertEqual(expected_label, Language.objects.get(id=copy.id).label)
def test_bulk_insert_returning_ids(self):
    """Inserting returns id-bearing copies without assigning ids to the inputs."""
    lang_a = Language(label="test_bi_1")
    lang_b = Language(label="test_bi_2")
    self.assertIsNone(lang_a.id)
    self.assertIsNone(lang_b.id)
    inserted = bulk_insert_returning_ids([lang_a, lang_b])
    # Inserting must not assign ids to the passed-in instances...
    self.assertIsNone(lang_a.id)
    self.assertIsNone(lang_b.id)
    # ...but the returned rows have ids and round-trip from the database.
    self.assertIsNotNone(inserted[0].id)
    self.assertIsNotNone(inserted[1].id)
    self.assertEqual("test_bi_1", Language.objects.get(id=inserted[0].id).label)
    self.assertEqual("test_bi_2", Language.objects.get(id=inserted[1].id).label)
def import_sentences(self):
    """
    Import sentences from sentences.jsonl in batches of 1000 articles.

    Existing sentences (matched on article/parnr/sentnr) are reused; missing
    ones are bulk-inserted. Both are merged into the result via
    _load_sentences.

    @return: the `sentences` mapping built up by _load_sentences
        (keyed by article id — see _load_sentences for the exact layout)
    """
    sentences = {}  # aid -> {par, sent -> sent_id}
    for i, batch in enumerate(toolkit.splitlist(self._get_dicts("sentences.jsonl"), itemsperbatch=1000)):
        # Report the actual batch size (the final batch may be short).
        logging.info("Creating sentences for {n} articles, batch {i}".format(n=len(batch), i=i))
        # check existing articles
        articles = {self.status.articles[d["article_id"]]: d["sentences"] for d in batch}
        _load_sentences(Sentence.objects.filter(article_id__in=articles)
                        .values_list("article_id", "parnr", "sentnr", "pk"),
                        target=sentences)
        to_add = list(self.get_sentences(articles, sentences))
        if to_add:
            logging.info("Creating {} sentences".format(len(to_add)))
            # Materialize as a list: a generator here would be exhausted by
            # _load_sentences and unusable afterwards.
            added = [(s.article_id, s.parnr, s.sentnr, s.pk)
                     for s in bulk_insert_returning_ids(to_add, fields=["*"])]
            _load_sentences(added, target=sentences)
            # BUG FIX: removed `sentences.update(added)` — `added` was a
            # generator already consumed by _load_sentences (so the call was a
            # silent no-op), and dict.update on 4-tuples would raise ValueError
            # anyway; _load_sentences is the sole writer into `sentences`.
    return sentences
def test_bulk_insert_returning_ids(self):
    """The bulk insert yields persisted copies and leaves the inputs alone."""
    m1, m2 = Medium(name="test_bi_1"), Medium(name="test_bi_2")
    self.assertIsNone(m1.id)
    self.assertIsNone(m2.id)
    persisted = bulk_insert_returning_ids([m1, m2])
    # Inputs remain without ids after the insert.
    self.assertIsNone(m1.id)
    self.assertIsNone(m2.id)
    first, second = persisted
    self.assertIsNotNone(first.id)
    self.assertIsNotNone(second.id)
    self.assertEqual("test_bi_1", Medium.objects.get(id=first.id).name)
    self.assertEqual("test_bi_2", Medium.objects.get(id=second.id).name)
def save_trees(cls, trees):
    """
    Save a list of article trees efficiently to the database.

    Trees are persisted level by level (breadth-first) so every parent
    article already has a database id before its children are inserted.

    @type trees: [ArticleTree]
    """
    level = 0
    while True:
        nodes = [node for tree in trees for node in tree.get_level(level)]
        if not nodes:
            break
        # Link each node's article to its (already saved) parent article.
        for node in nodes:
            if node.parent is not None:
                node.obj.parent = node.parent.obj
        saved = bulk_insert_returning_ids(node.obj for node in nodes)
        # Swap in the saved copies so the next level links against real ids.
        for node, saved_article in zip(nodes, saved):
            node.obj = saved_article
        level += 1
def save_trees(cls, trees):
    """
    Save article trees to the database, one depth level at a time.

    Processing breadth-first guarantees a node's parent article has been
    inserted (and therefore has an id) before the node itself is saved.

    @type trees: [ArticleTree]
    """
    for depth in count():
        current = [node for tree in trees for node in tree.get_level(depth)]
        if not current:
            return
        for node in current:
            parent = node.parent
            if parent is not None:
                # Parent was replaced with its saved copy on a previous pass.
                node.obj.parent = parent.obj
        inserted = bulk_insert_returning_ids([node.obj for node in current])
        for node, article in zip(current, inserted):
            node.obj = article
def create_articles(cls, articles, articleset=None, articlesets=None, deduplicate=True, monitor=NullMonitor()):
    """
    Add the given articles to the database, the index, and the given set

    Duplicates are detected and have ._duplicate and .id set (and are added to sets)

    @param articles: a collection of objects with the necessary properties (.title etc)
    @param articleset(s): articleset object(s), specify either or none
    @param deduplicate: if False, skip hash-based duplicate detection entirely
    @param monitor: progress monitor; exactly 6 submonitor steps are consumed
    @return: the input articles, each with .id (and possibly ._duplicate) set
    @raises ValueError: if any article already has an explicit id
    """
    # NOTE(review): NullMonitor() as a default argument is a shared mutable
    # default — every default call reuses the same instance. Harmless only if
    # NullMonitor/submonitor are stateless; confirm before relying on it.
    monitor = monitor.submonitor(total=6)

    if articlesets is None:
        articlesets = [articleset] if articleset else []

    # Check for ids
    for a in articles:
        if a.id is not None:
            raise ValueError("Specifying explicit article ID in save not allowed")

    # Compute hashes, mark all articles as non-duplicates
    for a in articles:
        a.compute_hash()
        a._duplicate = None

    # Determine which articles are dupes of each other, *then* query the database
    # to check if the database has any articles we just got.
    if deduplicate:
        # Only the first article per hash is kept in `hashes`; later articles
        # with the same hash are marked as in-batch duplicates of it.
        hashes = collections.defaultdict(list)  # type: Dict[bytes, List[Article]]
        for a in articles:
            if a.hash in hashes:
                a._duplicate = hashes[a.hash][0]
            else:
                hashes[a.hash].append(a)

        # Check database for duplicates
        monitor.update(message="Checking _duplicates based on hash..")
        if hashes:
            results = Article.objects.filter(hash__in=hashes.keys()).only("hash")
            for orig in results:
                dupes = hashes[orig.hash]
                for dupe in dupes:
                    # Adopt the id of the pre-existing database article.
                    dupe._duplicate = orig
                    dupe.id = orig.id
    else:
        monitor.update()

    # Save all non-duplicates
    to_insert = [a for a in articles if not a._duplicate]
    monitor.update(message="Inserting {} articles into database..".format(len(to_insert)))
    if to_insert:
        result = bulk_insert_returning_ids(to_insert)
        for a, inserted in zip(to_insert, result):
            a.id = inserted.id
        # Index the freshly inserted articles in elastic, tagged with the
        # target articleset ids.
        dicts = [a.get_article_dict(sets=[aset.id for aset in articlesets]) for a in to_insert]
        amcates.ES().bulk_insert(dicts, batch_size=100, monitor=monitor)
    else:
        monitor.update()

    # At this point we can still have internal duplicates. Give them an ID as well.
    for article in articles:
        if article.id is None and article._duplicate is not None:
            article.id = article._duplicate.id

    if not articlesets:
        # Burn the remaining monitor steps so the submonitor completes.
        monitor.update(3)
        return articles

    # add new articles and _duplicates to articlesets
    monitor.update(message="Adding articles to articleset..")
    new_ids = {a.id for a in to_insert}
    # Duplicates of pre-existing articles (already indexed elsewhere) still
    # need to be re-indexed for set membership, hence add_to_index=True below.
    dupes = {a._duplicate.id for a in articles if a._duplicate} - new_ids
    for aset in articlesets:
        if new_ids:
            # New articles were just indexed above with their set ids.
            aset.add_articles(new_ids, add_to_index=False, monitor=monitor)
        else:
            monitor.update()
        if dupes:
            aset.add_articles(dupes, add_to_index=True, monitor=monitor)
        else:
            monitor.update()

    # Add to articleset caches
    properties = set()
    for article in articles:
        properties.update(article.properties.keys())

    for articleset in articlesets:
        articleset._add_to_property_cache(properties)

    return articles