class PlagiarismDetector(BaseService):
    """
    Service that scores an input text against the documents yielded by
    `PlagiarismDAO` using `TF-IDF` & `Cosine-similarity`.
    """
    plag_dao: PlagiarismDAO = inject(PlagiarismDAO)
    # Optional pre-built vectorizer. While it is left as `None`, a fresh
    # `TfidfVectorizer` is created for every comparison.
    vectorizer = None

    @staticmethod
    def tokenize_and_stem(doc):
        """
        Lower-cases the document, removes every character listed in
        `string.punctuation`, splits the result in tokens and stems them.
        :param doc: raw text of the document
        :return: list of stemmed tokens
        """
        # Mapping a character to `None` makes `str.translate` drop it.
        punctuation_remover = str.maketrans('', '', string.punctuation)
        tokens = nltk.word_tokenize(doc.lower().translate(punctuation_remover))
        return PlagiarismDetector.stem_tokens(tokens)

    @staticmethod
    def stem_tokens(tokens):
        """
        Stems the tokenized document with the Porter stemmer.
        :param tokens: iterable of tokens
        :return: list with one stemmed token per input token, in order
        """
        stemmer = nltk.stem.porter.PorterStemmer()
        return [stemmer.stem(token) for token in tokens]

    def cosine_similarity(self, source_doc, input_doc):
        """
        Computes the similarity score for `input_doc` by matching it against `source_doc`
        using `TF-IDF` & `Cosine-similarity`

        :param source_doc: text of the stored document
        :param input_doc: text being checked
        :return: entry [0, 1] of the pairwise product of the two TF-IDF rows
        """
        vectorizer = self.vectorizer or TfidfVectorizer(
            tokenizer=PlagiarismDetector.tokenize_and_stem,
            stop_words='english')
        # Row 0 belongs to `source_doc`, row 1 to `input_doc`.
        tfidf = vectorizer.fit_transform([source_doc, input_doc])
        # `@` is always a matrix product and `.toarray()` is the supported way
        # to densify a sparse result; the `.A` shortcut is gone from recent
        # SciPy releases.
        pairwise_similarity = (tfidf @ tfidf.T).toarray()
        return pairwise_similarity[0, 1]

    def compute_similarity(self, input_doc) -> Dict:
        """
        Returns a dict containing highest possible similarity score and the most similar doc.
        :param input_doc: text being checked
        :return: dict with keys `similarity_score` and `doc`; when no document
                 was compared they keep their initial values `-1` and `None`
        """
        most_similar_so_far = dict(similarity_score=-1, doc=None)

        # The DAO yields batches; the documents of a batch live under 'data'.
        for doc_info in self.plag_dao.yield_docs():
            docs: List[Document] = doc_info['data']

            for doc in docs:
                similarity_score = self.cosine_similarity(
                    doc.content, input_doc)
                # Strict comparison: on a tie the earliest document wins.
                if similarity_score > most_similar_so_far['similarity_score']:
                    most_similar_so_far['similarity_score'] = similarity_score
                    most_similar_so_far['doc'] = doc
        return most_similar_so_far
class PlagiarismDetection(BaseController):
    """Controller that reports the stored document most similar to a text."""
    plag_detector: PlagiarismDetector = inject(PlagiarismDetector)

    @intercept()
    def post(self, *args, **kwargs):
        """
        Detects plagiarism.

        Reads the `text` field of the JSON body, asks the detector for the
        most similar document and returns its score, the score as a
        percentage rounded to two decimals and the document itself.
        A missing `text` field is reported as a `BadRequest` required-field
        error.
        :return: `Response` with status code 200
        """

        data = request.get_json(force=True)
        input_doc = data.get('text', None)
        if input_doc is None:
            ExceptionBuilder(BadRequest).error(HttpErrorCode.REQUIRED_FIELD, 'text').throw()
        most_similar_doc_info: Dict = self.plag_detector.compute_similarity(input_doc)

        most_similar_doc = most_similar_doc_info['doc']
        if most_similar_doc is None:
            # The detector found nothing to compare against (e.g. an empty
            # repository); answer explicitly instead of failing on
            # `None.title` / `None.to_dict()` below.
            return Response(
                status_code=200,
                message='No documents available to compare the input text against',
                data={
                    'similarity_score': 0,
                    'similarity_percentage': 0,
                    'doc': None
                })

        similarity_score = most_similar_doc_info['similarity_score']
        similarity_percentage = round(similarity_score * 100, 2)

        message = "Input text is {}% similar to the doc `{}` with similarity score of {}".format(
            similarity_percentage, most_similar_doc.title, similarity_score
        )

        res_data = {
            'similarity_score': similarity_score,
            'similarity_percentage': similarity_percentage,
            'doc': most_similar_doc.to_dict()
        }

        return Response(status_code=200, message=message, data=res_data)
class Document(BaseController):
    """Controller for adding documents to the repo and listing them."""
    plag_dao: PlagiarismDAO = inject(PlagiarismDAO)
    elasticsearhobj = elasticsearch.ElasticSearchFunction()

    @intercept()
    def post(self, *args, **kwargs):
        """
        Adds a new document to repo.

        `content` and `title` are required in the JSON body; `description`
        and `author` are optional and default to an empty string. When a
        required field is missing or empty a `BadRequest` required-field
        error is thrown.
        :return: `Response` with status code 201
        """
        data = request.get_json(force=True)

        content = data.get('content', '')
        title = data.get('title', '')
        description = data.get('description', '')
        author = data.get('author', '')

        if not (content and title):
            ExceptionBuilder(BadRequest).error(HttpErrorCode.REQUIRED_FIELD,
                                               'content', 'title').throw()
        else:
            # Store the document in the database
            doc = self.plag_dao.create_doc(content,
                                           title,
                                           description=description,
                                           author=author)
            # Add the stored document to the elasticsearch index
            self.elasticsearhobj.add(doc.to_dict_es())

        return Response(status_code=201,
                        message='Document added successfully!')

    @intercept()
    def get(self):
        """
        Fetches all the documents(paginated).

        Query arguments: `page` (default 1), `per_page` (default 10) and
        `all`, which is passed to the DAO as `True` when the argument is
        present, whatever its value.
        :return: `Response` whose data holds the documents as dicts under
                 `data` and the DAO's `count`
        """
        args = request.args
        res = self.plag_dao.get_docs(page=int(args.get("page", 1)),
                                     per_page=int(args.get("per_page", 10)),
                                     all='all' in args)
        # The debug print of the whole payload was dropped: it wrote every
        # listed document to stdout on each request.
        docs_info = dict(data=[d.to_dict() for d in res['data']],
                         count=res['count'])
        return Response(data=docs_info)
class BaseController(Resource):
    """
    Base class that every controller must extend.
    """

    # Default service for the module; child controllers must override it.
    __service_class__ = inject('service.base.BaseService')

    @property
    def service(self):
        """
        Gives the controller its default service, i.e. whatever is configured
        under `__service_class__`. Usage inside controller's action methods::
            service_obj = self.service

        Raises `NotImplementedError` when `__service_class__` is `None`.
        """
        service_class = self.__service_class__
        if service_class is None:
            raise NotImplementedError(
                "Controller {} must override '__service_class__' property".format(type(self).__name__))
        # An `Injectable` is resolved through its `inject` attribute; anything
        # else is handed back untouched.
        if isinstance(service_class, Injectable):
            return service_class.inject
        return service_class
 def setUp(self):
     """Stores the host URL, the injected DAO and the injected detector on the instance."""
     self.host = 'http://0.0.0.0:5000'
     self.plag_dao: PlagiarismDAO = inject(PlagiarismDAO)
     self.plag_detector: PlagiarismDetector = inject(PlagiarismDetector)
Beispiel #6
0
class PlagiarismDetection(BaseController):
    """
    Controller that searches every piece of a submitted text through the
    elasticsearch helper and reports the top hit with comparison metrics.
    """
    plag_detector: PlagiarismDetector = inject(PlagiarismDetector)
    elasticsearhobj = elasticsearch.ElasticSearchFunction()
    functions_plag_obj = functions_plag.FunctionsPlagiarism()

    def _compare_highlight(self, parag_text_clean, highlight):
        """
        Builds the comparison entry between one piece of the input text and
        one highlighted fragment returned by the search.
        :param parag_text_clean: piece of input text, already passed through
                                 `getStringClean`
        :param highlight: highlighted fragment as returned by the search
        :return: dict describing the fragment and its comparison metrics
        """
        highlight_clean = self.functions_plag_obj.getStringClean(highlight)
        uncommon_words = list(
            self.functions_plag_obj.getUncommonWords(parag_text_clean,
                                                     highlight_clean))
        my_uncommon_words = self.functions_plag_obj.getHighlightPerformance(
            uncommon_words, parag_text_clean)
        return {
            'content':
            highlight,
            'levenshtein_distance':
            self.functions_plag_obj.getLevenshteinDistance(
                parag_text_clean, highlight_clean),
            'similatiry_difflib':
            self.functions_plag_obj.getRatioSequenceMatcher(
                parag_text_clean, highlight_clean),
            'uncommon_words':
            uncommon_words,
            'my_uncommon_words':
            my_uncommon_words
        }

    @intercept()
    def post(self, *args, **kwargs):
        """
        Detects plagiarism.

        Splits the `text` field of the JSON body with `sent_tokenize` and
        searches every piece by content. For each piece the top hit is
        reported together with one comparison entry per highlighted
        fragment. A piece without any hit is reported with a score of 0,
        no document and no highlights. A missing `text` field is reported
        as a `BadRequest` required-field error.
        :return: `Response` with status code 200 whose data lists one entry
                 per piece under `response_elastic`
        """
        data = request.get_json(force=True)
        input_doc = data.get('text', None)
        if input_doc is None:
            ExceptionBuilder(BadRequest).error(HttpErrorCode.REQUIRED_FIELD,
                                               'text').throw()

        response_es = []
        # The received text is split into pieces that are searched one by one
        for paragraph_text in sent_tokenize(input_doc):
            # Similarity is detected through the elasticsearch helper
            responseES = self.elasticsearhobj.searchByContent(paragraph_text)
            hits = responseES['hits']['hits']
            if not hits:
                # Nothing matched this piece; report it instead of failing
                # on `hits[0]`.
                response_es.append({
                    'paragraph_text': paragraph_text,
                    'similarity_score': 0,
                    'similarity_percentage': 0,
                    'doc_': None,
                    'highlight': []
                })
                continue

            best_hit = hits[0]
            # Cleaned once per piece: it does not depend on the highlight.
            parag_text_clean = self.functions_plag_obj.getStringClean(
                paragraph_text)
            # A fresh list per piece, so an entry only carries its own
            # highlights; a hit without highlighted content yields none.
            highlight_response = [
                self._compare_highlight(parag_text_clean, highlight)
                for highlight in best_hit.get('highlight', {}).get(
                    'content', [])
            ]

            # Entry delivered by the API for this piece
            # NOTE(review): 'similarity_percentage' mirrors the raw `_score`;
            # confirm whether a real percentage is expected here.
            response_es.append({
                'paragraph_text': paragraph_text,
                'similarity_score': best_hit['_score'],
                'similarity_percentage': best_hit['_score'],
                'doc_': best_hit['_source'],
                'highlight': highlight_response
            })

        # Final answer delivered by the POST
        super_res_data = {'response_elastic': response_es}
        return Response(status_code=200,
                        message='Return info match',
                        data=super_res_data)