def _run(self, job_size, articleset, name, project, **args):
    """Create one or more coding jobs from the bound form.

    Without a job_size, a single job over a copy of the whole article
    set is created and returned. With a job_size, the set is chopped
    into batches of that many articles, each batch getting its own
    (favourite=False) article set and its own CodingJob; the list of
    created jobs is returned.
    """
    # Make sure sentences exist for every article in the set before coding.
    CreateSentences(dict(articlesets=[articleset.id])).run()
    job = self.bound_form.save(commit=False)

    if not job_size:
        # Single job covering the entire set.
        job.articleset = ArticleSet.create_set(
            project=project, name=name,
            articles=articleset.articles.all(), favourite=False)
        job.save()
        return job

    total = articleset.articles.count()
    jobs = []
    for index, offset in enumerate(range(0, total, job_size)):
        job.pk = None  # force an INSERT: each batch becomes a new row
        batch = articleset.articles.all()[offset:offset + job_size]
        set_name = "{name} - {j}".format(name=name, j=index + 1)
        job.articleset = ArticleSet.create_set(
            project=project, articles=batch, name=set_name, favourite=False)
        job.name = "{name} - {j}".format(name=name, j=index + 1)
        job.save()
        jobs.append(CodingJob.objects.get(pk=job.pk))
    return jobs
def handle_split(form, project, article, sentences):
    """Split *article* at the given *sentences* and update set membership.

    Validates *form* first (raising ValueError on failure), saves the new
    split-off articles, then adds/removes the original and splitted
    articles to/from article sets as requested in the form's cleaned_data.
    Returns locals() so the caller can use it as template context
    (all_sets, new_set, new_splitted_set, ...).
    """
    if not form.is_valid():
        raise ValueError("Non-valid form passed: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Keep a list of touched sets, so we can invalidate their indices
    dirty_sets = ArticleSet.objects.none()

    # Add splitted articles to existing sets (direct through-table insert,
    # bypassing ArticleSet.add_articles)
    ArticleSet.articles.through.objects.bulk_create([
        ArticleSet.articles.through(articleset=aset, article=art)
        for art in articles
        for aset in form_data["add_splitted_to_sets"]
    ])

    # Collect changed sets
    for field in ("add_splitted_to_sets", "remove_from_sets", "add_to_sets"):
        dirty_sets |= form_data[field]

    # Add splitted articles to sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        articlesetarts = ArticleSet.articles.through.objects.filter(
            article=article, articleset__project=project)
        ArticleSet.articles.through.objects.bulk_create([
            ArticleSet.articles.through(articleset=asetart.articleset, article=art)
            for art in articles
            for asetart in articlesetarts
        ])
        dirty_sets |= project.all_articlesets().filter(articles=article).only("id")

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def handle_split(form, project, article, sentences):
    """Split *article* at the given *sentences* and update set membership.

    Fix: the form is validated BEFORE any split articles are saved, so an
    invalid form raises ValueError without leaving half-created articles
    and sentences behind (the original validated after the saves).

    Returns locals() so the caller can use it as template context
    (all_sets, new_set, new_splitted_set, ...).
    """
    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def _run(self, articleset, save_duplicates_to, dry_run, ignore_fields, **_):
    """Find and remove duplicate articles in *articleset*.

    Articles hashing identically (after dropping *ignore_fields*) are
    duplicates; of each group the article with the lowest id is kept.
    With dry_run nothing is removed, only logged. If save_duplicates_to
    is given, the removed duplicates are stored in a new set of that
    name. Returns (number of duplicates, dry_run).

    Fix: the loop variables ``id`` and ``hash`` shadowed the builtins of
    the same names; renamed to ``article_id`` / ``art_hash``.
    """
    hashes = collections.defaultdict(set)
    for i, (article_id, h) in enumerate(self.hash_articles(articleset, set(ignore_fields))):
        if not i % 100000:
            logging.info("Collecting hashes, n={i}, |hashes|={n}".format(n=len(hashes), **locals()))
        hashes[h].add(article_id)

    # Only groups with more than one member are actual duplicates
    hashes = {art_hash: ids for (art_hash, ids) in hashes.items() if len(ids) > 1}
    logging.info("Duplicates founds for {} articles".format(len(hashes)))

    to_remove = set()
    logging.info("Iterating over hashes")
    for i, (art_hash, ids) in enumerate(hashes.items()):
        if dry_run:
            logging.info("Duplicates: {ids}".format(**locals()))
        # Keep the lowest id of each group, mark the rest for removal
        to_remove |= set(sorted(ids)[1:])
        if not i % 100000:
            logging.info("Iterating over hashes {i}/{n}, |to_remove|={m}".format(n=len(hashes), m=len(to_remove), **locals()))

    n = len(to_remove)
    if not to_remove:
        logging.info("No duplicates found!")
    else:
        if dry_run:
            logging.info("{n} duplicate articles found, run without dry_run to remove".format(**locals()))
        else:
            logging.info("Removing {n} articles from set".format(**locals()))
            articleset.remove_articles(to_remove)
            if save_duplicates_to:
                dupes_article_set = ArticleSet.create_set(articleset.project, save_duplicates_to, to_remove)
    return n, dry_run
def _run(self, articleset, save_duplicates_to, dry_run, **_):
    """Find and remove duplicate articles in *articleset*.

    Articles with identical hashes (from self.get_hashes()) are
    duplicates; of each group the article with the lowest id is kept.
    With dry_run nothing is removed, only logged. If save_duplicates_to
    is given, the removed duplicates are stored in a new set of that
    name. Returns (number of duplicates, dry_run).

    Fix: ``dict.iteritems()`` does not exist in Python 3 — replaced with
    ``items()``, matching the sibling implementation in this file. Also
    renamed ``id``/``hash`` loop variables that shadowed builtins.
    """
    hashes = collections.defaultdict(set)
    for i, (article_id, h) in enumerate(self.get_hashes()):
        if not i % 100000:
            logging.info("Collecting hashes, n={i}, |hashes|={n}".format(n=len(hashes), **locals()))
        hashes[h].add(article_id)

    # Only groups with more than one member are actual duplicates
    hashes = {art_hash: ids for (art_hash, ids) in hashes.items() if len(ids) > 1}
    logging.info("Duplicates founds for {} articles".format(len(hashes)))

    to_remove = set()
    logging.info("Iterating over hashes")
    for i, (art_hash, ids) in enumerate(hashes.items()):
        if dry_run:
            logging.info("Duplicates: {ids}".format(**locals()))
        # Keep the lowest id of each group, mark the rest for removal
        to_remove |= set(sorted(ids)[1:])
        if not i % 100000:
            logging.info("Iterating over hashes {i}/{n}, |to_remove|={m}".format(n=len(hashes), m=len(to_remove), **locals()))

    n = len(to_remove)
    if not to_remove:
        logging.info("No duplicates found!")
    else:
        if dry_run:
            logging.info("{n} duplicate articles found, run without dry_run to remove".format(**locals()))
        else:
            logging.info("Removing {n} articles from set".format(**locals()))
            articleset.remove_articles(to_remove)
            if save_duplicates_to:
                dupes_article_set = ArticleSet.create_set(articleset.project, save_duplicates_to, to_remove)
    return n, dry_run
def handle_split(form, project, article, sentences):
    """Split *article* at the given *sentences* and update set membership.

    Fix: the form is validated BEFORE the split articles are created, so
    an invalid form raises ValueError without persisting any articles or
    sentences (the original validated after the creation step).

    Returns locals() so the caller can use it as template context
    (all_sets, new_set, new_splitted_set, ...).
    """
    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # create_articles inserts the split articles and assigns their ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def _run(self, job_size, articleset, name, project, **args):
    """Create coding job(s) for *articleset*, batched by job_size if given."""
    article_ids = articleset.articles.all().values_list("id", flat=True)
    job = self.bound_form.save(commit=False)

    if job_size:
        # Chop the job into batches of at most job_size articles each.
        return create_codingjob_batches(job, article_ids, job_size)

    # No batching: one job over a fresh copy of the whole set.
    job.articleset = ArticleSet.create_set(
        project=project, name=name, articles=article_ids, favourite=False)
    job.save()
    return job
def _run(self, job_size, articleset, name, project, **args):
    """Create one or more coding jobs from the bound form.

    Without a job_size a single job over a copy of the whole set is
    returned; with one, the set is split into batches of job_size
    articles, each with its own article set and CodingJob (a list of
    jobs is returned).
    """
    # Ensure sentences exist for the set before coding starts.
    CreateSentences(dict(articlesets=[articleset.id])).run()
    job = self.bound_form.save(commit=False)

    if not job_size:
        job.articleset = ArticleSet.create_set(
            project=project, name=name, articles=articleset.articles.all())
        job.save()
        return job

    total = articleset.articles.count()
    jobs = []
    for index, offset in enumerate(range(0, total, job_size)):
        job.pk = None  # force an INSERT: each batch is a new CodingJob row
        batch_name = "{name} - {j}".format(name=name, j=index + 1)
        job.articleset = ArticleSet.create_set(
            project=project,
            articles=articleset.articles.all()[offset:offset + job_size],
            name=batch_name)
        job.name = batch_name
        job.save()
        jobs.append(CodingJob.objects.get(pk=job.pk))
    return jobs
def _create_codingjob_batches(codingjob, article_ids, batch_size):
    """Clone *codingjob* once per batch of *article_ids*, yielding new pks.

    Each batch of at most batch_size ids gets its own (favourite=False)
    article set named "<original name> - <batch number>".
    """
    base_name = codingjob.name
    for batch_nr, batch in enumerate(splitlist(article_ids, batch_size), start=1):
        codingjob.pk = None  # saving with pk=None inserts a fresh row
        codingjob.name = "{name} - {i}".format(i=batch_nr, name=base_name)
        codingjob.articleset = ArticleSet.create_set(
            articles=batch,
            favourite=False,
            name=codingjob.name,
            project=codingjob.project,
        )
        codingjob.save()
        yield codingjob.pk
def _create_codingjob_batches(codingjob, article_ids, batch_size):
    """Clone *codingjob* for each batch of *article_ids*, yielding each new pk.

    Each batch of at most batch_size ids gets its own (favourite=False)
    article set named "<original name> - <batch number>".
    """
    name = codingjob.name
    for i, batch in enumerate(splitlist(article_ids, batch_size)):
        # Setting pk to None makes save() INSERT a new row
        codingjob.pk = None
        codingjob.name = "{name} - {i}".format(i=i + 1, name=name)
        codingjob.articleset = ArticleSet.create_set(
            project=codingjob.project,
            name=codingjob.name,
            favourite=False,
            articles=batch,
        )
        codingjob.save()
        yield codingjob.pk
def _run(self, articleset, save_duplicates_to, dry_run, **kwargs):
    """Remove duplicate articles from *articleset*, per publication date.

    Iterates the distinct dates in the set (via elasticsearch) and removes
    the duplicates reported by self.get_duplicates(date) — what counts as
    a duplicate is defined there. With dry_run nothing is removed. If
    save_duplicates_to is given, removed ids are collected in a new set of
    that name. Returns the accumulated {article: duplicates} mapping.
    """
    all_dupes = {}
    dupes_save_set = None  # created lazily on the first batch of duplicates
    log.debug("Deduplicating {articleset.id}".format(**locals()))
    for date in ES().list_dates(filters={"sets": articleset}):
        log.debug("Getting duplicates for {date}".format(**locals()))
        dupes = dict(self.get_duplicates(date))
        if dupes:
            all_dupes.update(dupes)
            # Flatten the per-article duplicate lists into one id list
            todelete = list(itertools.chain(*dupes.values()))
            if not dry_run:
                articleset.remove_articles(todelete)
            if save_duplicates_to:
                if dupes_save_set is None:
                    dupes_save_set = ArticleSet.create_set(articleset.project, save_duplicates_to)
                dupes_save_set.add_articles(todelete)
    log.debug("Deleted dupes for {} articles".format(len(all_dupes)))
    return all_dupes
def _run(self, articleset, save_duplicates_to, dry_run, **kwargs):
    """Deduplicate *articleset* date by date; optionally archive removed ids.

    Returns the accumulated mapping of duplicates as reported by
    self.get_duplicates(date).
    """
    all_dupes = {}
    dupes_save_set = None
    log.debug("Deduplicating {articleset.id}".format(articleset=articleset))
    for date in ES().list_dates(filters={"sets": articleset}):
        log.debug("Getting duplicates for {date}".format(date=date))
        dupes = dict(self.get_duplicates(date))
        if not dupes:
            continue
        all_dupes.update(dupes)
        duplicate_ids = list(itertools.chain.from_iterable(dupes.values()))
        if not dry_run:
            articleset.remove_articles(duplicate_ids)
        if save_duplicates_to:
            # Create the target set only once, on first use
            if dupes_save_set is None:
                dupes_save_set = ArticleSet.create_set(articleset.project, save_duplicates_to)
            dupes_save_set.add_articles(duplicate_ids)
    log.debug("Deleted dupes for {} articles".format(len(all_dupes)))
    return all_dupes
def handle_split(form, project, article, sentences):
    """Split *article* at the given *sentences* and update set membership.

    Validates *form* first (raising ValueError on failure), saves the new
    split-off articles, then adds/removes the original and splitted
    articles to/from article sets as requested in the form's cleaned_data.
    Returns locals() so the caller can use it as template context
    (all_sets, new_set, new_splitted_set, ...).
    """
    if not form.is_valid():
        raise ValueError(
            "Non-valid form passed: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Keep a list of touched sets, so we can invalidate their indices
    dirty_sets = ArticleSet.objects.none()

    # Add splitted articles to existing sets (direct through-table insert,
    # bypassing ArticleSet.add_articles)
    ArticleSet.articles.through.objects.bulk_create([
        ArticleSet.articles.through(articleset=aset, article=art)
        for art in articles
        for aset in form_data["add_splitted_to_sets"]
    ])

    # Collect changed sets
    for field in ("add_splitted_to_sets", "remove_from_sets", "add_to_sets"):
        dirty_sets |= form_data[field]

    # Add splitted articles to sets wherein the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        articlesetarts = ArticleSet.articles.through.objects.filter(
            article=article, articleset__project=project)
        ArticleSet.articles.through.objects.bulk_create([
            ArticleSet.articles.through(articleset=asetart.articleset, article=art)
            for art in articles
            for asetart in articlesetarts
        ])
        dirty_sets |= project.all_articlesets().filter(
            articles=article).only("id")

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project,
                                        form_data["add_to_new_set"], [article])

    return locals()