def delete(self, purge_orphans=True):
    "Delete the articleset and all articles from index and db"
    # which articles are only in this set?
    # check per N articles
    log.warn("Getting all articles")
    aids = list(self.articles.values_list("pk", flat=True))
    todelete = set(aids)
    log.warn("Finding orphans in {} articles".format(len(aids)))
    for aids in toolkit.splitlist(aids, itemsperbatch=1000):
        x = set(ArticleSetArticle.objects.filter(article_id__in=aids).exclude(articleset=self)
                .values_list("article_id", flat=True))
        todelete -= x
    log.warn("Removing {} orphans from DB".format(len(todelete)))
    #Article.objects.filter(pk__in=todelete).delete()
    for i, aids in enumerate(toolkit.splitlist(todelete, itemsperbatch=10000)):
        if i > 1:
            log.warn("... batch {i} (x10k)".format(**locals()))
        #Article.objects.filter(pk__in=aids)._raw_delete(Article.objects.db)
        Article.objects.filter(pk__in=aids).only("pk").delete()
    log.warn("Getting set membership from elastic")
    esaids = list(self.get_article_ids_from_elastic())
    if esaids:
        log.warn("Removing set membership from elastic ({} articles)".format(len(esaids)))
        amcates.ES().remove_from_set(self.id, esaids)
    if purge_orphans:
        amcates.ES().refresh()
        amcates.ES().purge_orphans()
    log.warn("Deleting set (and articlesetarticle references)")
    super(ArticleSet, self).delete()  # cascade deletes all article references
    log.warn("Done!")

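# purge_orphans() is called above but not shown in these snippets. A heavily
# hedged sketch of what it might do, assuming an "orphan" is an index document
# that no longer belongs to any set (the real amcates implementation may
# differ, and the exact delete_by_query signature depends on the
# elasticsearch-py version):
def purge_orphans(self):
    """Delete all articles without any set membership from the index."""
    query = {"query": {"bool": {"must_not": {"exists": {"field": "sets"}}}}}
    self.es.delete_by_query(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body=query)
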
def run(self, _input):
    projects, sets = list(self.options["projects"]), list(self.options["sets"])
    if self.options["include_project_sets"]:
        sets += list(ArticleSet.objects.filter(project__in=projects))
    log.info("Cleaning projects {projects}, sets {sets}".format(**locals()))
    articles = set()
    if projects:
        q = Article.objects.filter(project__in=projects)
        articles |= set(aid for (aid,) in q.values_list("id"))
    if sets:
        q = ArticleSetArticle.objects.filter(articleset__in=sets)
        articles |= set(aid for (aid,) in q.values_list("article_id"))
    if self.options["batch"]:
        log.info("Cleaning {n} articles in {m} batch(es) of {b}".format(
            n=len(articles), b=self.options["batch"],
            m=1 + len(articles) // self.options["batch"]))
        for i, articles in enumerate(splitlist(articles, self.options["batch"])):
            log.info("Batch {i}: Cleaning {n} articles".format(n=len(articles), **locals()))
            Solr().add_articles(articles)
    else:
        log.info("Cleaning {n} articles".format(n=len(articles)))
        Solr().add_articles(articles)
    log.info("Done!")

def exists(cls, article_ids, batch_size=500):
    """
    Filters the given articleids to remove non-existing ids
    """
    for batch in splitlist(article_ids, itemsperbatch=batch_size):
        for aid in Article.objects.filter(pk__in=batch).values_list("pk", flat=True):
            yield aid

def add_to_set(self, setid, article_ids):
    """Add the given articles to the given set. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        return
    for batch in splitlist(article_ids, itemsperbatch=1000):
        self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

def remove_from_set(self, setid, article_ids, flush=True):
    """Remove the given articles from the given set. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        return
    for batch in splitlist(article_ids, itemsperbatch=1000):
        self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set': setid})

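# bulk_update itself is not shown in these snippets. A hypothetical sketch of
# how one batch of ids could become a single _bulk request of scripted updates;
# it treats the UPDATE_SCRIPT_* constants as opaque script sources, reuses the
# error handling visible in bulk_insert elsewhere in these snippets, and the
# exact script envelope depends on the Elasticsearch version (assumptions, not
# the real amcates implementation):
def bulk_update(self, article_ids, script, params):
    body = []
    for aid in article_ids:
        body.append({"update": {"_id": aid}})           # action line: which doc to update
        body.append({"script": {"source": script, "params": params}})  # payload line
    resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)
    if resp["errors"]:
        raise ElasticSearchError(resp)
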
def test_splitlist(self):
    for input, output, itemsperbatch in (
            ([1, 2, 3], [[1, 2], [3]], 2),
            ([1, 2, 3], [[1, 2, 3]], 20),
            ((1, 2, 3), [(1, 2), (3,)], 2),
            ((i for i in (1, 2, 3)), [[1, 2], [3]], 2),
    ):
        o = toolkit.splitlist(input, itemsperbatch)
        self.assertEqual(list(o), output)

def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
    """Add the given articles to the given set. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        return
    batches = list(splitlist(article_ids, itemsperbatch=1000))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(40 / nbatches, "Added batch {i}/{nbatches}".format(**locals()))
        self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
    """Add the given articles to the given set. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        return
    batches = list(splitlist(article_ids, itemsperbatch=1000))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(40 / nbatches, "Added batch {iplus}/{nbatches}".format(iplus=i + 1, **locals()))
        self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

def _check_uuids(self, uuids):
    """Check which articles are already present in the destination database
    Returns a sequence of article ids that are NOT present, ie that need to be copied"""
    for i, batch in enumerate(splitlist(uuids.keys(), itemsperbatch=10000)):
        log.info("({i}) Checking whether {n} uuids are present in destination database"
                 .format(n=len(batch), **locals()))
        present = {uuid for (uuid,) in Article.objects.filter(uuid__in=batch).values_list("uuid")}
        for uuid in set(batch) - present:
            yield uuids[uuid]

def test_splitlist(self):
    def plusone(l):
        for i, e in enumerate(l):
            l[i] = e + 1

    for input, output, itemsperbatch in (
            ([1, 2, 3], [[1, 2], [3]], 2),
            ([1, 2, 3], [[1, 2, 3]], 20),
            ((1, 2, 3), [(1, 2), (3,)], 2),
            ((i for i in (1, 2, 3)), [[1, 2], [3]], 2),
    ):
        o = toolkit.splitlist(input, itemsperbatch)
        self.assertEqual(list(o), output)

def in_index(self, ids):
    """
    Check whether the given ids are already indexed.
    @return: a sequence of ids that are in the index
    """
    if not isinstance(ids, list):
        ids = list(ids)
    log.info("Checking existence of {nids} documents".format(nids=len(ids)))
    if not ids:
        return
    for batch in splitlist(ids, itemsperbatch=10000):
        result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                              body={"ids": batch}, fields=[])
        for doc in result['docs']:
            if doc['found']:
                yield int(doc['_id'])

def _create_codingjob_batches(codingjob, article_ids, batch_size):
    name = codingjob.name
    for i, batch in enumerate(splitlist(article_ids, batch_size)):
        # setting pk to None makes save() insert a new row, cloning the job
        codingjob.pk = None
        codingjob.name = "{name} - {i}".format(i=i + 1, name=name)
        codingjob.articleset = ArticleSet.create_set(
            project=codingjob.project,
            name=codingjob.name,
            favourite=False,
            articles=batch,
        )
        codingjob.save()
        yield codingjob.pk

def add_articles(self, article_ids, batch_size=1000):
    """
    Add the given article_ids to the index. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator).
    """
    if not article_ids:
        return
    from amcat.models import Article, ArticleSetArticle
    n = len(article_ids) // batch_size
    for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)):
        log.info("Adding batch {i}/{n}".format(**locals()))
        all_sets = multidict((aa.article_id, aa.articleset_id)
                             for aa in ArticleSetArticle.objects.filter(article__in=batch))
        dicts = (get_article_dict(article, list(all_sets.get(article.id, [])))
                 for article in Article.objects.filter(pk__in=batch))
        self.bulk_insert(dicts)

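# multidict is a helper from the toolkit that is not shown here. A minimal
# sketch of the behavior assumed by the call site above, which collects
# (article_id, articleset_id) pairs into a mapping from article id to the
# collection of set ids (the real helper may differ in the value type):
def multidict(pairs):
    d = {}
    for key, value in pairs:
        d.setdefault(key, set()).add(value)
    return d
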
def delete(self, purge_orphans=True):
    "Delete the articleset and all articles from index and db"
    # which articles are only in this set?
    # check per N articles
    for aids in toolkit.splitlist(self.articles.values_list("pk", flat=True)):
        x = set(ArticleSetArticle.objects.filter(article_id__in=aids).exclude(articleset=self)
                .values_list("article_id", flat=True))
        todelete = set(aids) - x
        Article.objects.filter(pk__in=todelete).delete()
        amcates.ES().remove_from_set(self.id, aids)
    if purge_orphans:
        amcates.ES().purge_orphans()
    super(ArticleSet, self).delete()  # cascade deletes all article references

def test_splitlist(self):
    seq = [1, 2, 3, 4]

    # Test standard cases
    self.assertEqual(list(splitlist([], 10)), [])
    self.assertEqual(list(splitlist(seq, 1)), [[1], [2], [3], [4]])
    self.assertEqual(list(splitlist(seq, 2)), [[1, 2], [3, 4]])
    self.assertEqual(list(splitlist(seq, 3)), [[1, 2, 3], [4]])
    self.assertEqual(list(splitlist(seq, 5)), [[1, 2, 3, 4]])

    # Erroneous cases
    self.assertRaises(ValueError, lambda: list(splitlist([], 0)))

    # Does it work for all iterables?
    self.assertEqual(list(splitlist(iter(seq), 3)), [[1, 2, 3], [4]])

def import_sentences(self):
    sentences = {}  # aid -> {par, sent -> sent_id}
    for i, batch in enumerate(toolkit.splitlist(self._get_dicts("sentences.jsonl"), itemsperbatch=1000)):
        logging.info("Creating sentences for 1000 articles, batch {i}".format(**locals()))
        # check existing articles
        articles = {self.status.articles[d["article_id"]]: d["sentences"] for d in batch}
        _load_sentences(Sentence.objects.filter(article_id__in=articles)
                        .values_list("article_id", "parnr", "sentnr", "pk"),
                        target=sentences)
        to_add = list(self.get_sentences(articles, sentences))
        if to_add:
            logging.info("Creating {} sentences".format(len(to_add)))
            added = ((s.article_id, s.parnr, s.sentnr, s.pk)
                     for s in bulk_insert_returning_ids(to_add, fields=["*"]))
            _load_sentences(added, target=sentences)
    return sentences

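# _load_sentences is not shown in these snippets. A sketch consistent with its
# call sites above, which feed it (article_id, parnr, sentnr, pk) rows and a
# target dict documented as "aid -> {par, sent -> sent_id}" (the key shape of
# the inner dict is an assumption):
def _load_sentences(rows, target):
    for article_id, parnr, sentnr, pk in rows:
        target.setdefault(article_id, {})[parnr, sentnr] = pk
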
def test_splitlist(self):
    def plusone(l):
        for i, e in enumerate(l):
            l[i] = e + 1

    for input, output, itemsperbatch, options in (
            ([1, 2, 3], [[1, 2], [3]], 2, {}),
            ([1, 2, 3], [[1, 2, 3]], 20, {}),
            ((1, 2, 3), [(1, 2), (3,)], 2, {}),
            ([1, 2, 3], [[2, 3], [4]], 2, dict(buffercall=plusone)),
            ((i for i in (1, 2, 3)), [[1, 2], [3]], 2, {}),
            ((i for i in (1, 2, 3)), [1, 2, 3], 2, dict(yieldelements=True)),
            ((i for i in (1, 2, 3)), [2, 3, 4], 2, dict(buffercall=plusone, yieldelements=True)),
    ):
        o = toolkit.splitlist(input, itemsperbatch, **options)
        self.assertEqual(list(o), output)

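# The tests above pin down splitlist's contract: sequences are sliced (so
# tuples yield tuples), other iterables are buffered into lists, itemsperbatch
# must be positive, buffercall is applied to each batch before it is yielded,
# and yieldelements flattens the output. A minimal sketch consistent with
# these tests; the real amcat.tools.toolkit.splitlist (and its default batch
# size) may differ:
def splitlist(iterable, itemsperbatch=100, buffercall=None, yieldelements=False):
    """Split iterable into batches of itemsperbatch elements."""
    if itemsperbatch <= 0:
        raise ValueError("itemsperbatch should be a positive integer")

    def batches():
        if hasattr(iterable, "__getitem__"):
            # sequence: slice, preserving the input type (list, tuple, ...)
            for i in range(0, len(iterable), itemsperbatch):
                yield iterable[i:i + itemsperbatch]
        else:
            # generic iterable (e.g. a generator): buffer into lists
            batch = []
            for item in iterable:
                batch.append(item)
                if len(batch) >= itemsperbatch:
                    yield batch
                    batch = []
            if batch:
                yield batch

    for batch in batches():
        if buffercall:
            buffercall(batch)
        if yieldelements:
            for element in batch:
                yield element
        else:
            yield batch
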
def test_splitlist(self):
    seq = [1, 2, 3, 4]

    # Test standard cases
    self.assertEqual(list(toolkit.splitlist([], 10)), [])
    self.assertEqual(list(toolkit.splitlist(seq, 1)), [[1], [2], [3], [4]])
    self.assertEqual(list(toolkit.splitlist(seq, 2)), [[1, 2], [3, 4]])
    self.assertEqual(list(toolkit.splitlist(seq, 3)), [[1, 2, 3], [4]])
    self.assertEqual(list(toolkit.splitlist(seq, 5)), [[1, 2, 3, 4]])

    # Erroneous cases
    self.assertRaises(ValueError, lambda: list(toolkit.splitlist([], 0)))

    # Does it work for all iterables?
    self.assertEqual(list(toolkit.splitlist(iter(seq), 3)), [[1, 2, 3], [4]])

def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
    """
    Bulk insert the given articles in batches of batch_size
    """
    batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
    monitor = monitor.submonitor(total=len(batches))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals()))
        props, articles = set(), {}
        for d in batch:
            props |= (set(d.keys()) - ALL_FIELDS)
            articles[d["id"]] = serialize(d)
        self.check_properties(props)
        body = get_bulk_body(articles)
        resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)
        if resp["errors"]:
            raise ElasticSearchError(resp)

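# get_bulk_body is not shown in these snippets. A hypothetical sketch of how
# the id -> serialized-document mapping built above could be turned into the
# newline-delimited _bulk payload (the real helper may differ):
import json

def get_bulk_body(articles):
    lines = []
    for aid, serialized in articles.items():
        lines.append(json.dumps({"index": {"_id": aid}}))  # action line
        lines.append(serialized)                           # document line
    return "\n".join(lines) + "\n"
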
def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
    """Add the given articles to the given set. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        if monitor:
            monitor.update()
        return
    batches = list(splitlist(article_ids, itemsperbatch=1000))
    monitor = monitor.submonitor(total=len(batches))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches))
        self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
    """Add the given articles to the given set. This is done in batches, so there
    is no limit on the length of article_ids (which can be a generator)."""
    if not article_ids:
        if monitor:
            monitor.update()
        return
    batches = [set(batch) for batch in splitlist(article_ids, itemsperbatch=1000)]
    monitor = monitor.submonitor(total=len(batches))
    nbatches = len(batches)
    for i, batch in enumerate(batches):
        monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches))
        missing = batch - set(self.in_index(batch))
        if missing:
            logging.warning("Adding {} missing articles to elastic".format(len(missing)))
            self.add_articles(missing)
        if batch - missing:
            self.bulk_update(batch - missing, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

def import_articles(self):
    def create_articles(batch):
        for a in batch:
            a['oldid_int'] = a.pop('old_id')
            if a['text'] == '':
                a['text'] = '-'
            if a['title'] == '':
                a['title'] = '-'
        articles = Article.create_articles([Article(project_id=self.status.project.id, **a) for a in batch])
        self.status.articles.update({a.get_property('oldid_int'): a.id for a in articles})
        return articles

    hashes = {}

    def create_articles_store_hashes(batch):
        arts = create_articles(batch)
        hashes.update({a.get_property('oldid_int'): a.hash for a in arts})

    # save articles without parents
    for i, batch in enumerate(toolkit.splitlist(self._get_dicts("articles.jsonl"), itemsperbatch=1000)):
        logging.info("Import articles batch {i}".format(**locals()))
        create_articles(batch)

    # Do first pass of articles with parents in batches to avoid potential memory issues
    # (I'm assuming here that the number of 2+ depth children will not be too high)
    todo = []
    hashes = {}  # old_id: hash
    for j, batch in enumerate(toolkit.splitlist(self._get_dicts("articles_with_parents.jsonl"), itemsperbatch=1000)):
        logging.info("Iterating over articles with parents, batch {j}".format(**locals()))
        # sort children, create articles for direct children, remember parent structure for others
        known = []
        for a in batch:
            a['parentid_int'] = a.pop('parent_id')
            if a['parentid_int'] in self.status.articles:
                known.append(a)
            else:
                todo.append(a)
        # retrieve parent hash and create articles for known parents
        if known:
            parents = dict(Article.objects.filter(pk__in={self.status.articles[a['parentid_int']] for a in known})
                           .values_list("pk", "hash"))
            for a in known:
                a['parent_hash'] = parents[self.status.articles[a['parentid_int']]]
            logging.info("Saving {} articles with known parents".format(len(known)))
            create_articles_store_hashes(known)
    logging.info("Direct children saved, {} articles to be dealt with".format(len(todo)))

    # deal with remaining children: (1) save 'real' orphans (ie parent not in this project)
    known_ids = {a["old_id"] for a in todo}
    new_todo, tosave = [], []
    for a in todo:
        parent = a['parentid_int']
        if parent not in known_ids and parent not in self.status.articles:
            logging.warning("Parent {parent} for article {aid} unknown, removing parent relation"
                            .format(aid=a["old_id"], parent=parent))
            tosave.append(a)
        else:
            new_todo.append(a)
    if tosave:
        logging.info("Saving {} articles without parents".format(len(tosave)))
        create_articles_store_hashes(tosave)

    # (2) make passes through articles until either done or no progress made
    logging.info("Real orphans saved, {} articles todo".format(len(new_todo)))
    while new_todo:
        todo, tosave, new_todo = new_todo, [], []
        for a in todo:
            if a['parentid_int'] in hashes:
                a['parent_hash'] = hashes[a['parentid_int']]
                tosave.append(a)
            else:
                new_todo.append(a)
        if not tosave:
            logging.info("No articles to save, breaking; {} articles todo".format(len(new_todo)))
            break
        logging.info("Saving {} articles with found parents, {} articles todo".format(len(tosave), len(new_todo)))
        create_articles_store_hashes(tosave)

    # store remaining articles without parent, cycles are stupid anyway, right?
    if new_todo:
        logging.info("Data contained cycles, saving remaining {} articles without parents".format(len(new_todo)))
        create_articles(new_todo)

def _copy_articles(self, aids):
    for batch in splitlist(aids, itemsperbatch=1000):
        self._do_copy_articles(batch)

def distribute_tasks(tasks, action, nthreads=4, queue_size=10, retry_exceptions=False,
                     batch_size=None, output_action=None):
    """
    Distribute the elements in tasks over nthreads threads using a queue.
    The threads will call action(task) on each element in tasks.
    If action(task) raises an exception, the element is placed on the problem list.
    If retry_exceptions is non-False, after all elements are done the problematic
    elements are retried. Otherwise, the list of problems is returned.
    If batch_size is not None, will 'cut' tasks into batches of that size and
    place the sub-sequences on the queue.
    If output_action is given, this function will be called from the worker
    thread for the result of each action.
    """
    starttime = time.time()
    count = 0
    queue = Queue(queue_size)
    problems = []
    log.debug("Creating and starting {nthreads} threads".format(**locals()))
    for i in range(nthreads):
        QueueProcessorThread(action, queue, problems, output_action, name="Worker_%i" % i).start()
    log.debug("Placing tasks on queue")
    if batch_size:
        for subset in toolkit.splitlist(tasks, batch_size):
            count += len(subset)
            queue.put(subset)
    else:
        for task in tasks:
            queue.put(task)
            count += 1
    log.debug("Waiting until queue is empty")
    queue.join()
    while problems and retry_exceptions:
        log.debug('Retrying {n} problematic tasks'.format(n=len(problems)))
        # use a temporary list to hold problems and clear problems list before retrying
        _problems = problems[:]
        del problems[:]
        for problem in _problems:
            queue.put(problem)
        queue.join()
        if type(retry_exceptions) == int:
            retry_exceptions -= 1
    queue.done = True
    total_time = time.time() - starttime
    rate = count / (total_time + .00001)
    log.debug('Processed {count} tasks in {total_time:.0f} seconds ({rate:.2f}/second)'.format(**locals()))
    return problems

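# QueueProcessorThread is referenced above but not shown. A minimal sketch
# inferred from the call sites (the action/queue/problems/output_action
# arguments, queue.join() pairing with task_done(), and the dynamically set
# queue.done shutdown flag); the real worker class may differ:
import logging
import threading
from queue import Empty

log = logging.getLogger(__name__)

class QueueProcessorThread(threading.Thread):
    def __init__(self, action, queue, problems, output_action=None, name=None):
        super(QueueProcessorThread, self).__init__(name=name)
        self.daemon = True
        self.action = action
        self.queue = queue
        self.problems = problems
        self.output_action = output_action

    def run(self):
        # keep polling until distribute_tasks sets queue.done after the final join()
        while not getattr(self.queue, "done", False):
            try:
                task = self.queue.get(timeout=0.1)
            except Empty:
                continue
            try:
                result = self.action(task)
                if self.output_action is not None:
                    self.output_action(result)
            except Exception:
                # failed tasks go on the shared problem list for optional retry
                log.exception("Error on processing task, adding to problems")
                self.problems.append(task)
            finally:
                self.queue.task_done()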