Example #1
0
def create_sample(extraction_result, sample_id, job_id, url,
        source_type, source_val='', domain='', label=None, silent=False,
        vote_sample=True, btm_sample=False, training=True, *args, **kwargs):
    """
    Finalize (or discard) a sample once its extraction steps have reported.

    ``extraction_result`` is an iterable of step outcomes; the sample counts
    as extracted only when every item is exactly ``True``.

    On success the ``Sample`` row with ``sample_id`` is updated with the
    source and flag fields. Unless ``silent`` is set, an event is then sent:
    ``EventNewGoldSample`` (after saving a ``GoldSample``) when ``label`` is
    given, otherwise ``EventNewBTMSample`` or ``EventNewSample`` depending on
    ``btm_sample``.

    On failure the ``Sample`` row is deleted and, when ``label`` is given,
    the job's ``gold_left`` counter is decremented.

    ``url``, ``*args`` and ``**kwargs`` are accepted but not used here.

    Returns the tuple ``(extracted, sample_id)``.
    """
    succeeded = all(outcome is True for outcome in extraction_result)

    # Looked up before either branch, so a missing job fails up front.
    job = Job.objects.get(id=job_id)

    if not succeeded:
        # Extraction failed - drop the sample entry.
        Sample.objects.filter(id=sample_id).delete()
        if label is not None:
            # NOTE(review): the filter is gold_left >= 0, so a counter that
            # is already 0 ends up at -1 - confirm this is intended.
            matching_jobs = Job.objects.filter(id=job.id, gold_left__gte=0)
            matching_jobs.update(gold_left=F('gold_left') - 1)
        return (succeeded, sample_id)

    # All previous steps succeeded - fill in the proper sample entry.
    Sample.objects.filter(id=sample_id).update(
        source_type=source_type,
        source_val=source_val,
        domain=domain,
        vote_sample=vote_sample,
        btm_sample=btm_sample,
        training=training,
    )
    sample = Sample.objects.get(id=sample_id)

    if silent:
        # Caller asked for no events.
        return (succeeded, sample_id)

    if label is None:
        # Ordinary sample created successfully - pushing event.
        event_name = "EventNewBTMSample" if btm_sample else "EventNewSample"
        send_event(
            event_name,
            job_id=job.id,
            sample_id=sample_id,
        )
    else:
        # Golden sample created successfully - pushing event.
        gold = GoldSample(sample=sample, label=label)
        gold.save()
        send_event(
            "EventNewGoldSample",
            job_id=job.id,
            gold_id=gold.id,
        )

    return (succeeded, sample_id)
Example #2
0
def copy_sample_to_job(sample_id, job_id, source_type, label='', source_val='',
        btm_sample=False, *args, **kwargs):
    """
    Copy an existing sample into another job and announce the new sample.

    A new ``Sample`` is created for the job ``job_id`` reusing the ``url``,
    ``text`` and ``screenshot`` of the sample ``sample_id``. BTM samples are
    created with ``vote_sample`` and ``training`` switched off; all other
    samples have both switched on.

    Events sent for the new sample, in order: ``EventSampleScreenshotDone``,
    ``EventSampleContentDone``, then either ``EventNewGoldSample`` (after
    saving a ``GoldSample``, when ``label`` is not ``None``) or
    ``EventNewBTMSample`` / ``EventNewSample`` depending on ``btm_sample``.

    ``*args`` and ``**kwargs`` are accepted but not used here.

    Returns the id of the already-existing sample for this job and url when
    an ``IntegrityError`` is hit. On the success path nothing is returned
    explicitly.
    NOTE(review): the success path yields ``None`` while the IntegrityError
    path yields a sample id - confirm whether ``new_sample.id`` should be
    returned as well.

    On any other ``DatabaseError`` the task's ``retry`` is invoked with an
    exponentially growing countdown.
    """
    try:
        old_sample = Sample.objects.get(id=sample_id)
        job = Job.objects.get(id=job_id)

        # BTM samples take part in neither voting nor training.
        vote_sample = not btm_sample
        training = not btm_sample

        new_sample = Sample.objects.create(
            job=job,
            url=old_sample.url,
            text=old_sample.text,
            screenshot=old_sample.screenshot,
            source_type=source_type,
            source_val=source_val,
            btm_sample=btm_sample,
            vote_sample=vote_sample,
            training=training,
        )

        # Content and screenshot are copied over, so both "done" events are
        # sent straight away.
        send_event(
            "EventSampleScreenshotDone",
            sample_id=new_sample.id,
            sample_url=new_sample.url,
            job_id=new_sample.job_id,
        )
        send_event(
            "EventSampleContentDone",
            sample_id=new_sample.id,
            sample_url=new_sample.url,
            job_id=new_sample.job_id,
        )

        # Golden sample
        # NOTE(review): ``label`` defaults to '' which is not None, so a
        # GoldSample is created unless the caller passes label=None
        # explicitly - confirm this is intended.
        if label is not None:
            # GoldSample created successfully - pushing event.
            gold = GoldSample(
                sample=new_sample,
                label=label
            )
            gold.save()
            send_event(
                "EventNewGoldSample",
                job_id=job.id,
                gold_id=gold.id,
            )

        # Ordinary sample
        else:
            # Sample created successfully - pushing event.
            send_event(
                "EventNewBTMSample" if btm_sample else "EventNewSample",
                job_id=job.id,
                sample_id=new_sample.id,
            )

    except IntegrityError:
        # Such a sample has been created in the meantime; hand back its id.
        # Keep this handler ahead of DatabaseError: if IntegrityError is a
        # subclass of it (as in Django), the order decides which one runs.
        return Sample.objects.get(job=job, url=old_sample.url).id
    except DatabaseError as e:
        # Retry on db errors such as 'Database is locked'. The countdown
        # starts at 60 seconds, doubles with every retry and is capped at
        # one day (60 * 60 * 24 seconds).
        # "as e" replaces the Python-2-only "except X, e" form so the module
        # also parses on Python 3.
        copy_sample_to_job.retry(exc=e,
            countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))