Esempio n. 1
0
    def test_native_simple_repeatability(self):
        """Hash the hello_world sample twice with sherlock; both runs must match."""
        sample = PlagcheckTestData.get_text("simple/hello_world.txt")

        # Two independent invocations on the very same input text.
        first_run, second_run = [sherlock.signature_str(sample) for _ in range(2)]

        self.assertSequenceEqual(first_run, second_run)
Esempio n. 2
0
    def test_native_simple_repeatability(self):
        """Calling sherlock.signature_str twice on one text yields equal sequences."""
        source_text = PlagcheckTestData.get_text("simple/hello_world.txt")

        reference_hashes = sherlock.signature_str(source_text)
        # A second pass over the identical input must reproduce the first result.
        self.assertSequenceEqual(
            reference_hashes,
            sherlock.signature_str(source_text),
        )
Esempio n. 3
0
 def test_sherlock_mod(self):
     """Compare sherlock.signature_str and sherlock.signature on the same sample.

     signature_str receives the value of PlagcheckTestData.get_text, while
     signature receives the value of PlagcheckTestData.get_text_path; both
     hash sequences are expected to be equal.
     """
     sample_name = "simple/hello_world.txt"

     from_text = sherlock.signature_str(PlagcheckTestData.get_text(sample_name))
     from_path = sherlock.signature(PlagcheckTestData.get_text_path(sample_name))

     self.assertSequenceEqual(from_text, from_path)
Esempio n. 4
0
def check(self, **kwargs):
    """
    Do a check of a document against the hashed documents in the db.

    Call this function directly to run the check synchronously:

        tasks.check(args)

    or call this function asynchronously like:

        tasks.check.delay(args)

    to schedule a check at the celery worker process using a RabbitMQ message.

    The following are all required kwargs parameters that have to be passed.

    :param doc_id -- ID of the document within the Document store

    :return: Future object of this task invocation when called asynchronously,
    or the result if called synchronously.
    :raises PlagcheckError: if a computed similarity exceeds 100%.

    An OperationalError raised anywhere in the check triggers a task retry
    (at most 2 retries, 5 seconds apart).
    """
    try:
        suspect_doc = Document.objects.get(pk=kwargs['doc_id'])

        # delete existing references to older versions of this document
        Reference.remove_references(suspect_doc.id)

        # generate the hashes and drop duplicates in one step
        hash_set = set(sherlock.signature_str(suspect_doc.text))
        hash_count = len(hash_set)

        # findings are collected first and persisted only after the
        # Result row exists, because every Suspicion references it
        suspicions = []
        if hash_count > 0:
            # store (new) references
            Reference.store_references(suspect_doc.id, hash_set)

            # compute individual similarity by computing the count
            # of matches for each other document. This returns a list
            # of possible matching documents.
            similar_elaborations = Reference.get_similar_elaborations(suspect_doc.id)
            for (similar_doc_id, match_count) in similar_elaborations:
                similarity = round((100.0/hash_count) * match_count, 4)

                if similarity > 100:
                    # Named fields keep each value tied to its label; the
                    # previous positional indices pointed past the argument
                    # list and raised IndexError instead of PlagcheckError.
                    raise PlagcheckError(
                        'computed similarity is greater than 100% ({similarity}). '
                        'doc_id={doc_id}, similar_doc_id={similar_doc_id}, '
                        'hash_count={hash_count}, match_count={match_count}'
                        .format(
                            similarity=similarity,
                            doc_id=suspect_doc.id,
                            similar_doc_id=similar_doc_id,
                            hash_count=hash_count,
                            match_count=match_count,
                        )
                    )

                # put them in a list so that filtered
                # findings can be handled later
                suspicions.append({
                    'similar_doc_id': similar_doc_id,
                    'similarity': similarity,
                    'match_count': match_count
                })

        result = Result.objects.create(
            hash_count=hash_count,
            doc_id=suspect_doc.id,
            submission_time=suspect_doc.submission_time.isoformat(),
        )

        for suspicion_item in suspicions:
            suspicion = Suspicion(
                suspect_doc_id=suspect_doc.id,
                similar_doc_id=suspicion_item['similar_doc_id'],
                similarity=suspicion_item['similarity'],
                match_count=suspicion_item['match_count'],
                result=result,
                state=Suspicion.DEFAULT_STATE.value
            )

            (suspicion_state, reason) = filter_suspicion(suspicion, suspicion_filters)

            # only suspicions for which the filters yield a state are saved
            if suspicion_state is not None:
                suspicion.state = suspicion_state.value
                suspicion.save()

        return result.celery_result()
    except OperationalError as e:
        print("Got an OperationalError, retrying")
        # retry() raises by default; the explicit ``raise`` makes it obvious
        # that control never continues past this point.
        raise self.retry(exc=e, max_retries=2, countdown=5)
Esempio n. 5
0
 def test_sherlock_mod(self):
     """Hashes from signature_str and from signature must be the same sequence.

     Both calls are fed from the "simple/hello_world.txt" sample: the first
     via PlagcheckTestData.get_text, the second via get_text_path.
     """
     fixture = "simple/hello_world.txt"

     text_hashes = sherlock.signature_str(PlagcheckTestData.get_text(fixture))
     # The second signature is derived from get_text_path instead of get_text.
     self.assertSequenceEqual(
         text_hashes,
         sherlock.signature(PlagcheckTestData.get_text_path(fixture)),
     )