Example #1
0
    def delete(self, purge_orphans=True):
        "Delete the articleset and all articles from index and db"
        # which articles are only in this set?
        # check per N articles
        
        log.warn("Getting all articles")

        aids = list(self.articles.values_list("pk", flat=True))
        todelete = set(aids)
        log.warn("Finding orphans in {} articles".format(len(aids)))
        for aids in toolkit.splitlist(aids, itemsperbatch=1000):
            x = set(ArticleSetArticle.objects.filter(article_id__in=aids).exclude(articleset=self)
                    .values_list("article_id", flat=True))
            todelete -= x
        log.warn("Removing {} orphans from DB".format(len(todelete)))
        #Article.objects.filter(pk__in=todelete).delete()
        for i, aids in enumerate(toolkit.splitlist(todelete, itemsperbatch=10000)):
            if i > 1:
                log.warn("... batch {i} (x10k)".format(**locals()))
            #Article.objects.filter(pk__in=aids)._raw_delete(Article.objects.db)
            Article.objects.filter(pk__in=aids).only("pk").delete()

        log.warn("Getting set membership from elastic")
        esaids = list(self.get_article_ids_from_elastic())
        if esaids:
            log.warn("Removing set membership from elastic ({} articles)".format(len(esaids)))
            amcates.ES().remove_from_set(self.id, esaids)

        if purge_orphans:
            amcates.ES().refresh()
            amcates.ES().purge_orphans()

        log.warn("Deleting set (and articlesetarticle references)")
        super(ArticleSet, self).delete() # cascade deletes all article references
        log.warn("Done!")
Example #2
0
    def run(self, _input):
        projects, sets = list(self.options["projects"]), list(
            self.options["sets"])
        if self.options["include_project_sets"]:
            sets += list(ArticleSet.objects.filter(project__in=projects))
        log.info(
            "Cleaning projects {projects}, sets {sets}".format(**locals()))

        articles = set()
        if projects:
            q = Article.objects.filter(project__in=projects)
            articles |= set(aid for (aid, ) in q.values_list("id"))
        if sets:
            q = ArticleSetArticle.objects.filter(articleset__in=sets)
            articles |= set(aid for (aid, ) in q.values_list("article_id"))

        if self.options["batch"]:
            log.info("Cleaning {n} articles in {m} batch(es) of {b}".format(
                n=len(articles),
                b=self.options["batch"],
                m=1 + len(articles) // self.options["batch"]))
            for i, articles in enumerate(
                    splitlist(articles, self.options["batch"])):
                log.info("Batch {i}: Cleaning {n} articles".format(
                    n=len(articles), **locals()))
                Solr().add_articles(articles)
        else:
            log.info("Cleaning {n} articles".format(n=len(articles)))
            Solr().add_articles(articles)

        log.info("Done!")
Example #3
0
 def exists(cls, article_ids, batch_size=500):
     """
     Filters the given article ids to remove non-existing ids
     """
     for batch in splitlist(article_ids, itemsperbatch=batch_size):
         for aid in Article.objects.filter(pk__in=batch).values_list("pk", flat=True):
             yield aid
Example #4
0
 def exists(cls, article_ids, batch_size=500):
     """
     Filters the given article ids to remove non-existing ids
     """
     for batch in splitlist(article_ids, itemsperbatch=batch_size):
         for aid in Article.objects.filter(pk__in=batch).values_list(
                 "pk", flat=True):
             yield aid
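
A short usage sketch of exists; exposing it as a classmethod on Article is an assumption suggested by the cls parameter, and candidate_ids is a hypothetical iterable of article ids:

# Hedged sketch: keep only the ids that actually exist in the database.
existing = set(Article.exists(candidate_ids, batch_size=500))
missing = set(candidate_ids) - existing
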
Example #5
0
 def add_to_set(self, setid, article_ids):
     """Add the given articles to the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     for batch in splitlist(article_ids, itemsperbatch=1000):
         self.bulk_update(batch,
                          UPDATE_SCRIPT_ADD_TO_SET,
                          params={'set': setid})
Example #6
0
 def remove_from_set(self, setid, article_ids, flush=True):
     """Remove the given articles from the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     for batch in splitlist(article_ids, itemsperbatch=1000):
         self.bulk_update(batch,
                          UPDATE_SCRIPT_REMOVE_FROM_SET,
                          params={'set': setid})
Example #7
0
 def test_splitlist(self):
     for input, output, itemsperbatch in (
         ([1,2,3], [[1,2], [3]], 2),
         ([1,2,3], [[1,2, 3]], 20),
         ((1,2,3), [(1,2), (3,)], 2),
         ((i for i in (1,2,3)), [[1,2],[3]], 2),
         ):
         o = toolkit.splitlist(input, itemsperbatch)
         self.assertEqual(list(o), output)
Example #8
0
 def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
     """Add the given articles to the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     batches = list(splitlist(article_ids, itemsperbatch=1000))
     nbatches = len(batches)
     for i, batch in enumerate(batches):
         monitor.update(40/nbatches, "Added batch {i}/{nbatches}".format(**locals()))
         self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set' : setid})
Example #9
0
 def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
     """Add the given articles to the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     batches = list(splitlist(article_ids, itemsperbatch=1000))
     nbatches = len(batches)
     for i, batch in enumerate(batches):
         monitor.update(40/nbatches, "Added batch {iplus}/{nbatches}".format(iplus=i+1, **locals()))
         self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set' : setid})
Example #10
0
 def test_splitlist(self):
     for input, output, itemsperbatch in (
         ([1, 2, 3], [[1, 2], [3]], 2),
         ([1, 2, 3], [[1, 2, 3]], 20),
         ((1, 2, 3), [(1, 2), (3, )], 2),
         ((i for i in (1, 2, 3)), [[1, 2], [3]], 2),
     ):
         o = toolkit.splitlist(input, itemsperbatch)
         self.assertEqual(list(o), output)
Example #11
0
 def _check_uuids(self, uuids):
     """Check which articles are already present in the destination database
     Returns a sequence of article ids that are NOT present, i.e. that need to be copied"""
     for i, batch in enumerate(splitlist(uuids.keys(), itemsperbatch=10000)):
         log.info("({i}) Checking whether {n} uuids are present in destination database"
                  .format(n=len(batch), **locals()))
         
         present = {uuid for (uuid,) in Article.objects.filter(uuid__in=batch).values_list("uuid")}
         for uuid in set(batch) - present:
             yield uuids[uuid]
Example #12
0
    def _check_uuids(self, uuids):
        """Check which articles are already present in the destination database
        Returns a sequence of article ids that are NOT present, i.e. that need to be copied"""
        for i, batch in enumerate(splitlist(uuids.keys(), itemsperbatch=10000)):
            log.info("({i}) Checking whether {n} uuids are present in destination database"
                     .format(n=len(batch), **locals()))

            present = {uuid for (uuid,) in Article.objects.filter(uuid__in=batch).values_list("uuid")}
            for uuid in set(batch) - present:
                yield uuids[uuid]
Example #13
0
 def test_splitlist(self):
     def plusone(l):
         for i,e in enumerate(l):
             l[i] = e+1
     for input, output, itemsperbatch in (
         ([1,2,3], [[1,2], [3]], 2),
         ([1,2,3], [[1,2, 3]], 20),
         ((1,2,3), [(1,2), (3,)], 2),
         ((i for i in (1,2,3)), [[1,2],[3]], 2),
         ):
         o = toolkit.splitlist(input, itemsperbatch)
         self.assertEqual(list(o), output)
Example #14
0
    def delete(self, purge_orphans=True):
        "Delete the articleset and all articles from index and db"
        # which articles are only in this set?
        # check per N articles

        log.warn("Getting all articles")

        aids = list(self.articles.values_list("pk", flat=True))
        todelete = set(aids)
        log.warn("Finding orphans in {} articles".format(len(aids)))
        for aids in toolkit.splitlist(aids, itemsperbatch=1000):
            x = set(
                ArticleSetArticle.objects.filter(article_id__in=aids).exclude(
                    articleset=self).values_list("article_id", flat=True))
            todelete -= x
        log.warn("Removing {} orphans from DB".format(len(todelete)))
        #Article.objects.filter(pk__in=todelete).delete()
        for i, aids in enumerate(
                toolkit.splitlist(todelete, itemsperbatch=10000)):
            if i > 1:
                log.warn("... batch {i} (x10k)".format(**locals()))
            #Article.objects.filter(pk__in=aids)._raw_delete(Article.objects.db)
            Article.objects.filter(pk__in=aids).only("pk").delete()

        log.warn("Getting set membership from elastic")
        esaids = list(self.get_article_ids_from_elastic())
        if esaids:
            log.warn(
                "Removing set membership from elastic ({} articles)".format(
                    len(esaids)))
            amcates.ES().remove_from_set(self.id, esaids)

        if purge_orphans:
            amcates.ES().refresh()
            amcates.ES().purge_orphans()

        log.warn("Deleting set (and articlesetarticle references)")
        super(ArticleSet,
              self).delete()  # cascade deletes all article references
        log.warn("Done!")
Example #15
0
 def in_index(self, ids):
     """
     Check whether the given ids are already indexed.
     @return: a sequence of ids that are in the index
     """
     if not isinstance(ids, list): ids = list(ids)
     log.info("Checking existence of {nids} documents".format(nids=len(ids)))
     if not ids: return
     for batch in splitlist(ids, itemsperbatch=10000):
         result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                               body={"ids": batch}, fields=[])
         for doc in result['docs']:
             if doc['found']: yield int(doc['_id'])
Example #16
0
 def in_index(self, ids):
     """
     Check whether the given ids are already indexed.
     @return: a sequence of ids that are in the index
     """
     if not isinstance(ids, list): ids = list(ids)
     log.info("Checking existence of {nids} documents".format(nids=len(ids)))
     if not ids: return
     for batch in splitlist(ids, itemsperbatch=10000):
         result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                               body={"ids": batch}, fields=[])
         for doc in result['docs']:
             if doc['found']: yield int(doc['_id'])
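
A short usage sketch of in_index; treating amcates.ES() as the entry point (as in Example #1) and aids as some iterable of article ids are assumptions:

# Hedged sketch: split a collection of ids into indexed and not-yet-indexed.
aids = set(aids)
indexed = set(amcates.ES().in_index(aids))
not_indexed = aids - indexed
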
Example #17
0
    def test_splitlist(self):
        def plusone(l):
            for i, e in enumerate(l):
                l[i] = e + 1

        for input, output, itemsperbatch in (
            ([1, 2, 3], [[1, 2], [3]], 2),
            ([1, 2, 3], [[1, 2, 3]], 20),
            ((1, 2, 3), [(1, 2), (3, )], 2),
            ((i for i in (1, 2, 3)), [[1, 2], [3]], 2),
        ):
            o = toolkit.splitlist(input, itemsperbatch)
            self.assertEqual(list(o), output)
Example #18
0
def _create_codingjob_batches(codingjob, article_ids, batch_size):
    name = codingjob.name

    for i, batch in enumerate(splitlist(article_ids, batch_size)):
        codingjob.pk = None
        codingjob.name = "{name} - {i}".format(i=i + 1, name=name)
        codingjob.articleset = ArticleSet.create_set(
            project=codingjob.project,
            name=codingjob.name,
            favourite=False,
            articles=batch,
        )

        codingjob.save()
        yield codingjob.pk
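
_create_codingjob_batches reuses the same model instance as a template: clearing pk before save() makes Django insert a new row, so each batch becomes its own CodingJob with its own article set. A hedged usage sketch (the caller and the batch size are assumptions):

# Split one oversized coding job into jobs of at most 50 articles each.
new_job_ids = list(_create_codingjob_batches(codingjob, article_ids, batch_size=50))
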
Example #19
0
 def add_articles(self, article_ids, batch_size = 1000):
     """
     Add the given article_ids to the index. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator).
     """
     if not article_ids: return
     from amcat.models import Article, ArticleSetArticle
     if not isinstance(article_ids, list): article_ids = list(article_ids)
     n = len(article_ids) // batch_size
     for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)):
         log.info("Adding batch {i}/{n}".format(**locals()))
         all_sets = multidict((aa.article_id, aa.articleset_id)
                              for aa in ArticleSetArticle.objects.filter(article__in=batch))
         dicts = (get_article_dict(article, list(all_sets.get(article.id, [])))
                  for article in Article.objects.filter(pk__in=batch))
         self.bulk_insert(dicts)
Example #20
0
    def delete(self, purge_orphans=True):
        "Delete the articleset and all articles from index and db"
        # which articles are only in this set?
        # check per N articles
        for aids in toolkit.splitlist(self.articles.values_list("pk", flat=True)):
            x = set(ArticleSetArticle.objects.filter(article_id__in=aids).exclude(articleset=self)
                    .values_list("article_id", flat=True))
            todelete = set(aids) - x
            Article.objects.filter(pk__in=todelete).delete()
            amcates.ES().remove_from_set(self.id, aids)

        if purge_orphans:
            amcates.ES().purge_orphans()

        super(ArticleSet, self).delete() # cascade deletes all article references
Example #21
0
def _create_codingjob_batches(codingjob, article_ids, batch_size):
    name = codingjob.name

    for i, batch in enumerate(splitlist(article_ids, batch_size)):
        codingjob.pk = None
        codingjob.name = "{name} - {i}".format(i=i+1, name=name)
        codingjob.articleset = ArticleSet.create_set(
            project=codingjob.project,
            name=codingjob.name,
            favourite=False,
            articles=batch,
        )

        codingjob.save()
        yield codingjob.pk
Example #22
0
    def add_articles(self, article_ids, batch_size=1000):
        """
        Add the given article_ids to the index. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator).
        """
        if not article_ids: return
        from amcat.models import Article, ArticleSetArticle

        if not isinstance(article_ids, list): article_ids = list(article_ids)
        n = len(article_ids) // batch_size
        for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)):
            log.info("Adding batch {i}/{n}".format(**locals()))
            all_sets = multidict((aa.article_id, aa.articleset_id)
                                 for aa in ArticleSetArticle.objects.filter(article__in=batch))
            dicts = (get_article_dict(article, list(all_sets.get(article.id, [])))
                     for article in Article.objects.filter(pk__in=batch))
            self.bulk_insert(dicts)
Example #23
0
    def test_splitlist(self):
        seq = [1, 2, 3, 4]

        # Test standard cases
        self.assertEqual(list(splitlist([], 10)), [])
        self.assertEqual(list(splitlist(seq, 1)), [[1], [2], [3], [4]])
        self.assertEqual(list(splitlist(seq, 2)), [[1, 2], [3, 4]])
        self.assertEqual(list(splitlist(seq, 3)), [[1, 2, 3], [4]])
        self.assertEqual(list(splitlist(seq, 5)), [[1, 2, 3, 4]])

        # Erroneous cases
        self.assertRaises(ValueError, lambda: list(splitlist([], 0)))

        # Does it work for all iterables?
        self.assertEqual(list(splitlist(iter(seq), 3)), [[1, 2, 3], [4]])
Example #24
0
 def import_sentences(self):
     sentences = {}  # aid -> {par, sent -> sent_id}
     for i, batch in enumerate(toolkit.splitlist(self._get_dicts("sentences.jsonl"), itemsperbatch=1000)):
         logging.info("Creating sentences for 1000 articles, batch {i}".format(**locals()))
         # check existing articles
         articles = {self.status.articles[d["article_id"]]: d["sentences"] for d in batch}
         _load_sentences(Sentence.objects.filter(article_id__in=articles)
                         .values_list("article_id", "parnr", "sentnr", "pk")
                         , target=sentences)
         to_add = list(self.get_sentences(articles, sentences))
         if to_add:
             logging.info("Creating {} sentences".format(len(to_add)))
             added = ((s.article_id, s.parnr, s.sentnr, s.pk) for s in bulk_insert_returning_ids(to_add,
                                                                                                 fields=["*"]))
             _load_sentences(added, target=sentences)
     return sentences
Example #25
0
    def test_splitlist(self):
        def plusone(l):
            for i, e in enumerate(l):
                l[i] = e + 1

        for input, output, itemsperbatch, options in (
            ([1, 2, 3], [[1, 2], [3]], 2, {}),
            ([1, 2, 3], [[1, 2, 3]], 20, {}),
            ((1, 2, 3), [(1, 2), (3, )], 2, {}),
            ([1, 2, 3], [[2, 3], [4]], 2, dict(buffercall=plusone)),
            ((i for i in (1, 2, 3)), [[1, 2], [3]], 2, {}),
            ((i for i in (1, 2, 3)), [1, 2, 3], 2, dict(yieldelements=True)),
            ((i for i in (1, 2, 3)), [2, 3, 4], 2,
             dict(buffercall=plusone, yieldelements=True)),
        ):
            o = toolkit.splitlist(input, itemsperbatch, **options)
            self.assertEqual(list(o), output)
Example #26
0
    def test_splitlist(self):
        seq = [1, 2, 3, 4]

        # Test standard cases
        self.assertEqual(list(toolkit.splitlist([], 10)), [])
        self.assertEqual(list(toolkit.splitlist(seq, 1)), [[1], [2], [3], [4]])
        self.assertEqual(list(toolkit.splitlist(seq, 2)), [[1, 2], [3, 4]])
        self.assertEqual(list(toolkit.splitlist(seq, 3)), [[1, 2, 3], [4]])
        self.assertEqual(list(toolkit.splitlist(seq, 5)), [[1, 2, 3, 4]])

        # Erroneous cases
        self.assertRaises(ValueError, lambda: list(toolkit.splitlist([], 0)))

        # Does it work for all iterables?
        self.assertEqual(list(toolkit.splitlist(iter(seq), 3)), [[1, 2, 3], [4]])
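
The two tests above (together with the option tests in Example #25) pin down the splitlist contract: slices for sliceable sequences (lists stay lists, tuples stay tuples), list buffers for arbitrary iterables, a ValueError for a non-positive batch size, and optional buffercall/yieldelements behaviour. A minimal sketch consistent with those tests; the default batch size and the dispatch on __getitem__ are assumptions, not the actual toolkit implementation:

def splitlist(sequence, itemsperbatch=100, buffercall=None, yieldelements=False):
    """Yield the items in sequence in batches of itemsperbatch items.

    If buffercall is given, it is called on each batch before yielding.
    If yieldelements is True, individual elements are yielded instead of batches.
    """
    if itemsperbatch <= 0:
        raise ValueError("itemsperbatch should be a positive number")

    def emit(batch):
        if buffercall:
            buffercall(batch)
        return batch if yieldelements else [batch]

    if hasattr(sequence, "__getitem__"):
        # sliceable sequence: slicing preserves the input type (list -> lists, tuple -> tuples)
        for i in range(0, len(sequence), itemsperbatch):
            for x in emit(sequence[i:i + itemsperbatch]):
                yield x
    else:
        # generic iterable or generator: buffer elements into lists
        buffer = []
        for item in sequence:
            buffer.append(item)
            if len(buffer) >= itemsperbatch:
                for x in emit(buffer):
                    yield x
                buffer = []
        if buffer:
            for x in emit(buffer):
                yield x
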
Example #27
0
 def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
     """
     Bulk insert the given articles in batches of batch_size
     """
     batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
     monitor = monitor.submonitor(total=len(batches))
     nbatches = len(batches)
     for i, batch in enumerate(batches):
         monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals()))
         props, articles = set(), {}
         for d in batch:
             props |= (set(d.keys()) - ALL_FIELDS)
             articles[d["id"]] = serialize(d)
         self.check_properties(props)
         body = get_bulk_body(articles)
         resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)
         if resp["errors"]:
             raise ElasticSearchError(resp)
Example #28
0
    def delete(self, purge_orphans=True):
        "Delete the articleset and all articles from index and db"
        # which articles are only in this set?
        # check per N articles
        for aids in toolkit.splitlist(
                self.articles.values_list("pk", flat=True)):
            x = set(
                ArticleSetArticle.objects.filter(article_id__in=aids).exclude(
                    articleset=self).values_list("article_id", flat=True))
            todelete = set(aids) - x
            Article.objects.filter(pk__in=todelete).delete()
            amcates.ES().remove_from_set(self.id, aids)

        if purge_orphans:
            amcates.ES().purge_orphans()

        super(ArticleSet,
              self).delete()  # cascade deletes all article references
Example #29
0
    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = list(splitlist(article_ids, itemsperbatch=1000))
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(
                iplus=i + 1, nbatches=nbatches))
            self.bulk_update(batch,
                             UPDATE_SCRIPT_ADD_TO_SET,
                             params={'set': setid})
Example #30
0
    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = [set(batch) for batch in splitlist(article_ids, itemsperbatch=1000)]
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches))
            missing = batch - set(self.in_index(batch))
            if missing:
                logging.warning("Adding {} missing articles to elastic".format(len(missing)))
                self.add_articles(missing)
            if batch - missing:
                self.bulk_update(batch - missing, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})
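
A hedged usage sketch of add_to_set, calling the ES wrapper the same way Example #1 does; the articleset variable is an assumption:

# Add all articles of a set to the elastic index entry for that set.
aids = articleset.articles.values_list("pk", flat=True)
amcates.ES().add_to_set(articleset.id, aids)
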
Example #31
0
 def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
     """
     Bulk insert the given articles in batches of batch_size
     """
     batches = list(toolkit.splitlist(
         dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
     monitor = monitor.submonitor(total=len(batches))
     nbatches = len(batches)
     for i, batch in enumerate(batches):
         monitor.update(
             1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1,
                                                         **locals()))
         props, articles = set(), {}
         for d in batch:
             props |= (set(d.keys()) - ALL_FIELDS)
             articles[d["id"]] = serialize(d)
         self.check_properties(props)
         body = get_bulk_body(articles)
         resp = self.es.bulk(body=body,
                             index=self.index,
                             doc_type=settings.ES_ARTICLE_DOCTYPE)
         if resp["errors"]:
             raise ElasticSearchError(resp)
Example #32
0
 def remove_from_set(self, setid, article_ids, flush=True):
     """Remove the given articles from the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     for batch in splitlist(article_ids, itemsperbatch=1000):
         self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set' : setid})
Example #33
0
 def add_to_set(self, setid, article_ids):
     """Add the given articles to the given set. This is done in batches, so there
     is no limit on the length of article_ids (which can be a generator)."""
     if not article_ids: return
     for batch in splitlist(article_ids, itemsperbatch=1000):
         self.bulk_update(batch, UPDATE_SCRIPT_ADD_TO_SET, params={'set' : setid})
Example #34
0
    def import_articles(self):
        def create_articles(batch):
            for a in batch:
                a['oldid_int'] = a.pop('old_id')
                if a['text'] == '': a['text'] = '-'
                if a['title'] == '': a['title'] = '-'
            articles = Article.create_articles([Article(project_id=self.status.project.id, **a) for a in batch])
            self.status.articles.update({a.get_property('oldid_int'): a.id for a in articles})
            return articles
        hashes = {}
        def create_articles_store_hashes(batch):
            arts = create_articles(batch)
            hashes.update({a.get_property('oldid_int'): a.hash for a in arts})

        # save articles without parents
        for i, batch in enumerate(toolkit.splitlist(self._get_dicts("articles.jsonl"), itemsperbatch=1000)):
            logging.info("Import articles batch {i}".format(**locals()))
            create_articles(batch)

        # Do first pass of articles with parents in batches to avoid potential memory issues
        # (I'm assuming here that the number of 2+ depth children will not be too high)
        todo = []
        hashes = {}  # old_id: hash
        for j, batch in enumerate(toolkit.splitlist(self._get_dicts("articles_with_parents.jsonl"),
                                                    itemsperbatch=1000)):
            logging.info("Iterating over articles with parents, batch {j}".format(**locals()))
            # sort children, create articles for direct children, remember parent structure for others
            known = []
            for a in batch:
                a['parentid_int'] = a.pop('parent_id')
                if a['parentid_int'] in self.status.articles:
                    known.append(a)
                else:
                    todo.append(a)
            # retrieve parent hash and create articles for known parents
            if known:
                parents = dict(Article.objects.filter(pk__in={self.status.articles[a['parentid_int']] for a in known})
                               .values_list("pk", "hash"))
                for a in known:
                    a['parent_hash'] = parents[self.status.articles[a['parentid_int']]]
                logging.info("Saving {} articles with known parents".format(len(known)))
                create_articles_store_hashes(known)

        logging.info("Direct children saved, {} articles to be dealt with".format(len(todo)))
        # deal with remaining children: (1) save 'real' orphans (ie parent not in this project)
        known_ids = {a["old_id"] for a in todo}
        new_todo, tosave = [], []
        for a in todo:
            parent = a['parentid_int']
            if parent not in known_ids and parent not in self.status.articles:
                logging.warning("Parent {parent} for article {aid} unknown, removing parent relation"
                                .format(aid=a["old_id"], parent=parent))
                tosave.append(a)
            else:
                new_todo.append(a)
        if tosave:
            logging.info("Saving {} articles without parents".format(len(tosave)))
            create_articles_store_hashes(tosave)

        # (2) make passes through articles until either done or no progress made
        logging.info("Real orphans saved, {} articles todo".format(len(new_todo)))
        while new_todo:
            todo, tosave, new_todo = new_todo, [], []
            for a in todo:
                if a['parentid_int'] in hashes:
                    a['parent_hash'] = hashes[a['parentid_int']]
                    tosave.append(a)
                else:
                    new_todo.append(a)
            if not tosave:
                logging.info("No articles to save, breaking; {} articles todo".format(len(new_todo)))
                break
            logging.info("Saving {} articles with found parents, {} articles todo".format(len(tosave), len(new_todo)))
            create_articles_store_hashes(tosave)

        # store remaining articles without parent, cycles are stupid anyway, right?
        if new_todo:
            logging.info("Data contained cycles, saving remaining {} articles without parents".format(len(new_todo)))
            create_articles(new_todo)
Example #35
0
 def _copy_articles(self, aids):
     for batch in splitlist(aids, itemsperbatch=1000):
         self._do_copy_articles(batch)
Example #36
0
 def _copy_articles(self, aids):
     for batch in splitlist(aids, itemsperbatch=1000):
         self._do_copy_articles(batch)
Example #37
0
def distribute_tasks(tasks,
                     action,
                     nthreads=4,
                     queue_size=10,
                     retry_exceptions=False,
                     batch_size=None,
                     output_action=None):
    """
    Distribute the elements in tasks over nthreads threads using a queue.
    The threads will call action(task) on each element in tasks.

    If action(task) raises an exception, the element is placed on the problem
    list. If retry_exceptions is truthy, the problematic elements are retried
    after all elements are done; otherwise the list of problems is returned.

    If batch_size is not None, tasks are cut into batches of that size and the
    sub-sequences are placed on the queue.

    If output_action is given, it is called from the worker thread with the
    result of each action.
    """
    starttime = time.time()
    count = 0
    queue = Queue(queue_size)
    problems = []

    log.debug("Creating and starting {nthreads} threads".format(**locals()))
    for i in range(nthreads):
        QueueProcessorThread(action,
                             queue,
                             problems,
                             output_action,
                             name="Worker_%i" % i).start()

    log.debug("Placing tasks on queue")
    if batch_size:
        for subset in toolkit.splitlist(tasks, batch_size):
            count += len(subset)
            queue.put(subset)
    else:
        for task in tasks:
            queue.put(task)
            count += 1

    log.debug("Waiting until queue is empty")
    queue.join()

    while problems and retry_exceptions:
        log.debug('Retrying {n} problematic tasks'.format(n=len(problems)))
        # use a temporary list to hold problems and clear problems list before retrying
        _problems = problems[:]
        del problems[:]
        for problem in _problems:
            queue.put(problem)
        queue.join()
        if type(retry_exceptions) == int:
            retry_exceptions -= 1

    queue.done = True

    total_time = time.time() - starttime
    rate = count / (total_time + .00001)
    log.debug(
        'Processed {count} tasks in {total_time:.0f} seconds ({rate:.2f}/second)'
        .format(**locals()))

    return problems
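
A hedged usage sketch of distribute_tasks with batching enabled; index_batch and article_ids are hypothetical stand-ins (the add_articles call assumes the amcates ES wrapper from the earlier examples), and the retry count is arbitrary:

def index_batch(article_ids):
    # With batch_size set, each task handed to action() is a list of ids.
    amcates.ES().add_articles(article_ids)

problems = distribute_tasks(article_ids, index_batch, nthreads=4,
                            batch_size=100, retry_exceptions=2)
if problems:
    log.warning("{} batches could not be processed".format(len(problems)))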