コード例 #1
0
ファイル: add_codingjob.py プロジェクト: pombredanne/amcat
    def _run(self, job_size, articleset, name, project, **args):
        """
        Create coding job(s) from `articleset`.

        With no job_size, one job covering the whole set is created.
        Otherwise the set is split into batches of `job_size` articles and
        one job is saved per batch.

        @return: the single saved job, or a list of the saved batch jobs
        """
        # Ensure sentences exist for the whole source set before splitting.
        CreateSentences(dict(articlesets=[articleset.id])).run()
        job = self.bound_form.save(commit=False)

        if not job_size:
            # Single job over a (non-favourite) copy of the whole set.
            job.articleset = ArticleSet.create_set(
                project=project,
                name=name,
                articles=articleset.articles.all(),
                favourite=False)
            job.save()
            return job

        total = articleset.articles.count()
        jobs = []
        for batch_no, offset in enumerate(range(0, total, job_size), start=1):
            job.pk = None  # clearing the pk forces save() to INSERT a new row
            batch = articleset.articles.all()[offset:offset + job_size]
            batch_name = "{} - {}".format(name, batch_no)
            job.articleset = ArticleSet.create_set(project=project,
                                                   articles=batch,
                                                   name=batch_name,
                                                   favourite=False)
            job.name = batch_name
            job.save()
            jobs.append(CodingJob.objects.get(pk=job.pk))
        return jobs
コード例 #2
0
ファイル: article.py プロジェクト: kasperwelbers/amcat
def handle_split(form, project, article, sentences):
    """
    Split `article` into new articles and update the project's article
    sets according to the options selected in `form`.

    @param form: split form; must be valid, its cleaned_data drives every
        set operation below
    @param project: project whose article sets are affected
    @param article: the original article being split
    @param sentences: sentences from which the new articles are derived
    @return: locals(), used directly as template context — local variable
        names here are part of the contract, do not rename them without
        updating the template.
    """
    if not form.is_valid():
        raise ValueError("Non-valid form passed: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data 
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Keep a list of touched sets, so we can invalidate their indices
    dirty_sets = ArticleSet.objects.none()

    # Add splitted articles to existing sets
    # (bulk insert on the m2m through-model; this bypasses any
    # ArticleSet.add_articles bookkeeping by design)
    ArticleSet.articles.through.objects.bulk_create([
        ArticleSet.articles.through(articleset=aset, article=art) for
            art in articles for aset in form_data["add_splitted_to_sets"]
    ])

    # Collect changed sets
    for field in ("add_splitted_to_sets", "remove_from_sets", "add_to_sets"):
        dirty_sets |= form_data[field]

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        articlesetarts = ArticleSet.articles.through.objects.filter(article=article, articleset__project=project)

        ArticleSet.articles.through.objects.bulk_create([
            ArticleSet.articles.through(articleset=asetart.articleset, article=art)
                for art in articles for asetart in articlesetarts
        ])

        dirty_sets |= project.all_articlesets().filter(articles=article).only("id")

    # Remove the original article from the explicitly selected sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])
        
    # Remove the original article from every set in this project
    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    # Optionally collect the splitted articles in a brand-new set
    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    # Optionally (re-)add the original article to selected / new sets
    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
コード例 #3
0
ファイル: article_views.py プロジェクト: pombredanne/amcat
def handle_split(form, project, article, sentences):
    """
    Split `article` into new articles and update the project's article
    sets according to the options selected in `form`.

    NOTE(review): the new articles are saved (and sentences created)
    *before* the form is validated, so an invalid form still leaves the
    split articles in the database — confirm this ordering is intended.

    @param form: split form whose cleaned_data drives the set operations
    @param project: project whose article sets are affected
    @param article: the original article being split
    @param sentences: sentences from which the new articles are derived
    @return: locals(), used directly as template context — local variable
        names here are part of the contract, do not rename them.
    """
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    # Remove the original article from the explicitly selected sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    # Remove the original article from every set in this project
    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project,
                                              articles=article).distinct():
            aset.remove_articles([article])

    # Optionally collect the splitted articles in a brand-new set
    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    # Optionally (re-)add the original article to selected / new sets
    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"],
                                        [article])

    return locals()
コード例 #4
0
    def _run(self, articleset, save_duplicates_to, dry_run, **_):
        hashes = collections.defaultdict(set)
        for i, (id, h) in enumerate(self.get_hashes()):
            if not i%100000:
                logging.info("Collecting hashes, n={i}, |hashes|={n}".format(n=len(hashes), **locals()))
            hashes[h].add(id)

        hashes = {hash: ids for (hash, ids) in hashes.iteritems() if len(ids)>1}
        logging.info("Duplicates founds for {} articles".format(len(hashes)))

        to_remove = set()
        logging.info("Iterating over hashes")
        for i, (hash, ids) in enumerate(hashes.iteritems()):
            if dry_run:
                logging.info("Duplicates: {ids}".format(**locals()))
            to_remove |= set(sorted(ids)[1:])
            if not i % 100000:
                logging.info("Iterating over hashes {i}/{n}, |to_remove|={m}".format(n=len(hashes), m=len(to_remove), **locals()))

        n = len(to_remove)
        if not to_remove:
            logging.info("No duplicates found!")
        else:
            if dry_run:
                logging.info("{n} duplicate articles found, run without dry_run to remove".format(**locals()))
            else:
                logging.info("Removing {n} articles from set".format(**locals()))
                articleset.remove_articles(to_remove)
            if save_duplicates_to:
                dupes_article_set = ArticleSet.create_set(articleset.project, save_duplicates_to, to_remove)
        return n, dry_run
コード例 #5
0
ファイル: deduplicate_set.py プロジェクト: amcat/amcat
    def _run(self, articleset, save_duplicates_to, dry_run, ignore_fields, **_):
        hashes = collections.defaultdict(set)
        for i, (id, h) in enumerate(self.hash_articles(articleset, set(ignore_fields))):
            if not i % 100000:
                logging.info("Collecting hashes, n={i}, |hashes|={n}".format(n=len(hashes), **locals()))
            hashes[h].add(id)

        hashes = {hash: ids for (hash, ids) in hashes.items() if len(ids) > 1}
        logging.info("Duplicates founds for {} articles".format(len(hashes)))

        to_remove = set()
        logging.info("Iterating over hashes")
        for i, (hash, ids) in enumerate(hashes.items()):
            if dry_run:
                logging.info("Duplicates: {ids}".format(**locals()))
            to_remove |= set(sorted(ids)[1:])
            if not i % 100000:
                logging.info("Iterating over hashes {i}/{n}, |to_remove|={m}".format(n=len(hashes), m=len(to_remove),
                                                                                     **locals()))

        n = len(to_remove)
        if not to_remove:
            logging.info("No duplicates found!")
        else:
            if dry_run:
                logging.info("{n} duplicate articles found, run without dry_run to remove".format(**locals()))
            else:
                logging.info("Removing {n} articles from set".format(**locals()))
                articleset.remove_articles(to_remove)
            if save_duplicates_to:
                dupes_article_set = ArticleSet.create_set(articleset.project, save_duplicates_to, to_remove)
        return n, dry_run
コード例 #6
0
ファイル: deduplicate_set.py プロジェクト: amcat/amcat
    def hash_articles(cls, articleset: ArticleSet, ignore_fields: set) -> Iterable[Tuple[int, str]]:
        """
        Finds all articles in an articleset, and hashes articles as a tuple of field values, ordered alphabetically
        by field name. Fields in ignore_fields will not affect the hash.
        Hashes for two articles are equal, if and only if for each field that is not in ignore_fields, the
        values of those fields are equal in both articles.

        @param articleset       The articleset that is to be searched
        @param ignore_fields    A set of fields that should not be included in the calculated hashes

        @return                 An iterable of (<article_id>, <hash>) tuples.
        """
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())

        if not ignore_fields:
            # Nothing to ignore: the stored per-article hash can be used as-is.
            fields = ["hash"]
        else:
            fields = sorted(f for f in all_fields if f not in ignore_fields)

        # (A duplicate ES scan whose result was discarded has been removed
        # here — it ran the same query a second time for no effect.)
        for x in amcates.ES().scan(query={"query": {"constant_score": {"filter": {"term": {"sets": articleset.id}}}}},
                                   _source=fields):
            if not ignore_fields:
                yield int(x['_id']), x['_source']['hash']
                continue
            # Hash the article as the repr of its alphabetically-ordered
            # field-value tuple. `art_hash` avoids shadowing builtin hash().
            art_tuple = tuple(str(x['_source'].get(k, [None])) for k in fields)
            art_hash = hash_class(repr(art_tuple).encode()).hexdigest()
            yield int(x['_id']), art_hash
コード例 #7
0
ファイル: article_views.py プロジェクト: aemal/amcat
def handle_split(form, project, article, sentences):
    """
    Split `article` into new articles and update the project's article
    sets according to the options selected in `form`.

    NOTE(review): the new articles are bulk-created (and sentences
    fetched/created) *before* the form is validated, so an invalid form
    still leaves the split articles in the database — confirm this
    ordering is intended.

    @param form: split form whose cleaned_data drives the set operations
    @param project: project whose article sets are affected
    @param article: the original article being split
    @param sentences: sentences from which the new articles are derived
    @return: locals(), used directly as template context — local variable
        names here are part of the contract, do not rename them.
    """
    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    Article.create_articles(articles)
    for art in articles:
        sbd.get_or_create_sentences(art)

    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(**locals()))

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    # Remove the original article from the explicitly selected sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    # Remove the original article from every set in this project
    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    # Optionally collect the splitted articles in a brand-new set
    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    # Optionally (re-)add the original article to selected / new sets
    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
コード例 #8
0
ファイル: add_codingjob.py プロジェクト: BBie/amcat
    def _run(self, job_size, articleset, name, project, **args):
        """
        Create coding job(s) from `articleset`: one job over the whole
        set when job_size is falsy, otherwise one job per batch of
        `job_size` articles (delegated to create_codingjob_batches).
        """
        article_ids = articleset.articles.all().values_list("id", flat=True)
        job = self.bound_form.save(commit=False)

        if job_size:
            # Split into batches of job_size articles each.
            return create_codingjob_batches(job, article_ids, job_size)

        # No batching requested: one job over a copy of the whole set.
        job.articleset = ArticleSet.create_set(project=project, name=name, articles=article_ids, favourite=False)
        job.save()
        return job
コード例 #9
0
ファイル: add_codingjob.py プロジェクト: kasperwelbers/amcat
    def _run(self, job_size, articleset, name, project, **args):
        """
        Create coding job(s) from `articleset`.

        With no job_size, one job covering the whole set is created;
        otherwise the set is split into batches of `job_size` articles
        and one job is saved per batch.
        """
        # Ensure sentences exist for the source set before splitting.
        CreateSentences(dict(articlesets=[articleset.id])).run()
        job = self.bound_form.save(commit=False)

        if not job_size:
            job.articleset = ArticleSet.create_set(
                project=project, name=name, articles=articleset.articles.all())
            job.save()
            return job

        total = articleset.articles.count()
        jobs = []
        for batch_no, offset in enumerate(range(0, total, job_size), start=1):
            job.pk = None  # clearing the pk forces save() to INSERT a new row
            batch_name = "{} - {}".format(name, batch_no)
            job.articleset = ArticleSet.create_set(
                project=project,
                articles=articleset.articles.all()[offset:offset + job_size],
                name=batch_name)
            job.name = batch_name
            job.save()
            jobs.append(CodingJob.objects.get(pk=job.pk))
        return jobs
コード例 #10
0
    def _run(self, job_size, articleset, name, project, **args):
        """
        Create coding job(s) from `articleset`: a single job over the
        whole set when job_size is falsy, otherwise batches of
        `job_size` articles via create_codingjob_batches.
        """
        article_ids = articleset.articles.all().values_list("id", flat=True)
        job = self.bound_form.save(commit=False)

        if job_size:
            return create_codingjob_batches(job, article_ids, job_size)

        # One (non-favourite) set copying all article ids, one job.
        job.articleset = ArticleSet.create_set(project=project,
                                               name=name,
                                               articles=article_ids,
                                               favourite=False)
        job.save()
        return job
コード例 #11
0
ファイル: deduplicate_set.py プロジェクト: PaulHuygen/amcat
    def hash_articles(cls, articleset: ArticleSet,
                      ignore_fields: set) -> Iterable[Tuple[int, str]]:
        """
        Finds all articles in an articleset, and hashes articles as a tuple of field values, ordered alphabetically
        by field name. Fields in ignore_fields will not affect the hash.
        Hashes for two articles are equal, if and only if for each field that is not in ignore_fields, the
        values of those fields are equal in both articles.

        @param articleset       The articleset that is to be searched
        @param ignore_fields    A set of fields that should not be included in the calculated hashes

        @return                 An iterable of (<article_id>, <hash>) tuples.
        """
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())

        if not ignore_fields:
            # Nothing to ignore: the stored per-article hash can be used as-is.
            fields = ["hash"]
        else:
            fields = sorted(f for f in all_fields if f not in ignore_fields)

        # (A duplicate ES scan whose result was discarded has been removed
        # here — it ran the same query a second time for no effect.)
        for x in amcates.ES().scan(query={
                "query": {
                    "constant_score": {
                        "filter": {
                            "term": {
                                "sets": articleset.id
                            }
                        }
                    }
                }
        },
                                   fields=fields):
            if not ignore_fields:
                yield int(x['_id']), x['fields']['hash'][0]
                continue
            # Hash the article as the repr of its alphabetically-ordered
            # field-value tuple. `art_hash` avoids shadowing builtin hash().
            art_tuple = tuple(
                str(x['fields'].get(k, [None])[0]) for k in fields)
            art_hash = hash_class(repr(art_tuple).encode()).hexdigest()
            yield int(x['_id']), art_hash
コード例 #12
0
ファイル: codingjob.py プロジェクト: amcat/amcat
def _create_codingjob_batches(codingjob, article_ids, batch_size):
    """
    Save one copy of `codingjob` per batch of `batch_size` article ids,
    each with its own (non-favourite) article set, yielding each new pk.
    """
    base_name = codingjob.name

    for batch_no, batch in enumerate(splitlist(article_ids, batch_size), start=1):
        codingjob.pk = None  # clearing the pk makes save() INSERT a new row
        codingjob.name = "{name} - {i}".format(i=batch_no, name=base_name)
        codingjob.articleset = ArticleSet.create_set(
            project=codingjob.project,
            name=codingjob.name,
            favourite=False,
            articles=batch,
        )
        codingjob.save()
        yield codingjob.pk
コード例 #13
0
ファイル: article.py プロジェクト: kasperwelbers/amcat
def _save_articles(request, arts, project, cldata):
    """
    Save articles to database.

    @param request: request whose user determines which database to query
        for an existing set
    @param arts: articles to save
    @param project: project to save articles to
    @param cldata: django cleaned formdata
    @return: (the article set the articles were added to, arts)
    """
    for a in arts:
        a.project = project
        a.save()

    # Set logic: either create a brand-new set or look up the chosen
    # existing one on the user's database.
    # NOTE(review): if neither 'new_set' nor 'exi_set' is set, `nset` is
    # never bound and the loop below raises NameError — presumably the
    # form guarantees exactly one of them; verify against the caller.
    if cldata['new_set']:
        nset = ArticleSet(name=cldata['new_set'], project=project)
        nset.save()
    elif cldata['exi_set']:
        nset = ArticleSet.objects.using(request.user.db).get(
            name=cldata['exi_set'].name, project=project)

    for a in arts:
        nset.articles.add(a)

    return nset, arts
コード例 #14
0
ファイル: article.py プロジェクト: kasperwelbers/amcat
def _save_articles(request, arts, project, cldata):
    """
    Save articles to database and add them to an article set.

    @param request: request whose user determines which database to query
    @param arts: articles to save
    @param project: project to save articles to
    @param cldata: django cleaned formdata
    @return: (the target article set, arts)
    """
    for article in arts:
        article.project = project
        article.save()

    # Target set: either a freshly created one, or the chosen existing
    # set looked up on the user's database.
    if cldata['new_set']:
        nset = ArticleSet(name=cldata['new_set'], project=project)
        nset.save()
    elif cldata['exi_set']:
        nset = (ArticleSet.objects
                .using(request.user.db)
                .get(name=cldata['exi_set'].name, project=project))

    for article in arts:
        nset.articles.add(article)

    return nset, arts
コード例 #15
0
ファイル: codingjob.py プロジェクト: isususi/amcat
def _create_codingjob_batches(codingjob, article_ids, batch_size):
    """
    Split `article_ids` into batches of `batch_size` and save one copy of
    `codingjob` per batch (each with its own non-favourite article set),
    yielding the pk of every saved copy.
    """
    original_name = codingjob.name

    for index, batch in enumerate(splitlist(article_ids, batch_size)):
        batch_label = "{name} - {i}".format(i=index + 1, name=original_name)
        codingjob.pk = None  # clearing the pk makes save() INSERT a new row
        codingjob.name = batch_label
        codingjob.articleset = ArticleSet.create_set(
            project=codingjob.project,
            name=batch_label,
            favourite=False,
            articles=batch,
        )
        codingjob.save()
        yield codingjob.pk
コード例 #16
0
ファイル: deduplicate.py プロジェクト: BBie/amcat
    def _run(self, articleset, save_duplicates_to, dry_run, **kwargs):
        """
        Deduplicate `articleset` one date at a time.

        For every date in the set's index, duplicates found by
        self.get_duplicates are removed (unless dry_run) and optionally
        collected into a set named `save_duplicates_to`.

        @return: dict of all duplicates found, keyed as get_duplicates yields
        """
        all_dupes = {}
        dupes_save_set = None
        log.debug("Deduplicating {articleset.id}".format(**locals()))
        for date in ES().list_dates(filters={"sets": articleset}):
            log.debug("Getting duplicates for {date}".format(**locals()))
            dupes = dict(self.get_duplicates(date))
            if not dupes:
                continue
            all_dupes.update(dupes)
            todelete = list(itertools.chain(*dupes.values()))
            if not dry_run:
                articleset.remove_articles(todelete)
            if save_duplicates_to:
                # Lazily create the save-set on the first batch of dupes.
                if dupes_save_set is None:
                    dupes_save_set = ArticleSet.create_set(articleset.project, save_duplicates_to)
                dupes_save_set.add_articles(todelete)

        log.debug("Deleted dupes for {} articles".format(len(all_dupes)))
        return all_dupes
コード例 #17
0
    def _run(self, articleset, save_duplicates_to, dry_run, **kwargs):
        """
        Deduplicate `articleset` one date at a time.

        For every date in the set's index, duplicates found by
        self.get_duplicates are removed (unless dry_run) and optionally
        collected into a set named `save_duplicates_to`.

        @return: dict of all duplicates found, keyed as get_duplicates yields
        """
        all_dupes = {}
        dupes_save_set = None
        log.debug("Deduplicating {articleset.id}".format(**locals()))
        dates = ES().list_dates(filters={"sets": articleset})
        for date in dates:
            log.debug("Getting duplicates for {date}".format(**locals()))
            dupes = dict(self.get_duplicates(date))
            if dupes:
                all_dupes.update(dupes)
                todelete = list(itertools.chain.from_iterable(dupes.values()))
                if not dry_run:
                    articleset.remove_articles(todelete)
                if save_duplicates_to:
                    # Lazily create the save-set on the first batch of dupes.
                    if dupes_save_set is None:
                        dupes_save_set = ArticleSet.create_set(
                            articleset.project, save_duplicates_to)
                    dupes_save_set.add_articles(todelete)

        log.debug("Deleted dupes for {} articles".format(len(all_dupes)))
        return all_dupes
コード例 #18
0
ファイル: article.py プロジェクト: kasperwelbers/amcat
def handle_split(form, project, article, sentences):
    """
    Split `article` into new articles and update the project's article
    sets according to the options selected in `form`.

    @param form: split form; must be valid, its cleaned_data drives every
        set operation below
    @param project: project whose article sets are affected
    @param article: the original article being split
    @param sentences: sentences from which the new articles are derived
    @return: locals(), used directly as template context — local variable
        names here are part of the contract, do not rename them without
        updating the template.
    """
    if not form.is_valid():
        raise ValueError(
            "Non-valid form passed: {form.errors}".format(**locals()))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Keep a list of touched sets, so we can invalidate their indices
    dirty_sets = ArticleSet.objects.none()

    # Add splitted articles to existing sets
    # (bulk insert on the m2m through-model; this bypasses any
    # ArticleSet.add_articles bookkeeping by design)
    ArticleSet.articles.through.objects.bulk_create([
        ArticleSet.articles.through(articleset=aset, article=art)
        for art in articles for aset in form_data["add_splitted_to_sets"]
    ])

    # Collect changed sets
    for field in ("add_splitted_to_sets", "remove_from_sets", "add_to_sets"):
        dirty_sets |= form_data[field]

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        articlesetarts = ArticleSet.articles.through.objects.filter(
            article=article, articleset__project=project)

        ArticleSet.articles.through.objects.bulk_create([
            ArticleSet.articles.through(articleset=asetart.articleset,
                                        article=art) for art in articles
            for asetart in articlesetarts
        ])

        dirty_sets |= project.all_articlesets().filter(
            articles=article).only("id")

    # Remove the original article from the explicitly selected sets
    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    # Remove the original article from every set in this project
    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project,
                                              articles=article).distinct():
            aset.remove_articles([article])

    # Optionally collect the splitted articles in a brand-new set
    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    # Optionally (re-)add the original article to selected / new sets
    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"],
                                        [article])

    return locals()