Code example #1: buffered bulk storage of TextUnitSimilarity records
    # requires: from typing import List
    # TextUnitSimilarity is a Django model defined in the host project
    def store_unit_similarity_issues(self,
                                     un_sims: List[TextUnitSimilarity],
                                     flush: bool = False) -> None:
        """
        Store TextUnitSimilarity objects in a buffer so that they can be
        saved later in a single bulk insert / update operation.
        :param un_sims: items to store
        :param flush: write the buffer out even if it is not full yet
        """
        if un_sims:
            self.unsim_store_buffer += un_sims
        # keep buffering until the flush threshold is reached,
        # unless an explicit flush was requested
        if len(self.unsim_store_buffer) < self.store_buf_flush_count and not flush:
            return
        if self.unsim_store_buffer:
            # resolve joined references (unit -> document -> project)
            # before inserting the records
            TextUnitSimilarity.fill_joined_refs(self.unsim_store_buffer)
            TextUnitSimilarity.objects.bulk_create(self.unsim_store_buffer,
                                                   ignore_conflicts=True)
        self.unsim_store_buffer = []
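
The method above only implements the buffered writer; a caller is expected to feed it batches and force a final flush. A minimal sketch of such a driver loop (hypothetical, not project code), assuming unsim_store_buffer and store_buf_flush_count are initialized in the class constructor:

    # hypothetical driver for the buffered writer above
    def store_all_similarities(self, all_sims: List[TextUnitSimilarity]) -> None:
        batch_size = 1000  # assumed batch size
        for k in range(0, len(all_sims), batch_size):
            # each call appends to the buffer; the bulk insert fires only
            # once store_buf_flush_count items have accumulated
            self.store_unit_similarity_issues(all_sims[k:k + batch_size])
        # a final call with flush=True writes out whatever is left
        self.store_unit_similarity_issues([], flush=True)
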
Code example #2: finding similar documents and text units with TF-IDF and cosine similarity
    # requires: import math
    #           from sklearn.feature_extraction.text import TfidfVectorizer
    #           from sklearn.metrics.pairwise import cosine_similarity
    # Document, TextUnit, DocumentSimilarity, TextUnitSimilarity and the
    # normalize tokenizer are defined in the host project
    def process(self, **kwargs):
        """
        Find pairs of similar documents and/or text units.
        :param kwargs: search_similar_documents, search_similar_text_units,
            similarity_threshold, project, delete, use_idf
        :return: None
        """
        search_similar_documents = kwargs['search_similar_documents']
        search_similar_text_units = kwargs['search_similar_text_units']
        similarity_threshold = kwargs['similarity_threshold']
        project = kwargs.get('project')
        project_id = project['pk'] if project else 0
        self.log_info('Min similarity: %d' % similarity_threshold)

        # get text units at least 100 characters long
        filters = dict(unit_type='paragraph', textunittext__text__regex=r'.{100}.*')
        if project_id:
            filters['project_id'] = project_id
        text_units = TextUnit.objects.filter(**filters)
        len_tu_set = text_units.count()

        # estimate the number of progress pushes for this run
        push_steps = 0
        if search_similar_documents:
            push_steps += 4
        if search_similar_text_units:
            push_steps += math.ceil(len_tu_set / self.step) ** 2 + 3
        self.set_push_steps(push_steps)

        documents = Document.objects.filter(project_id=project_id) if project_id \
            else Document.objects.all()

        # similar Documents
        if search_similar_documents:

            # step #1 - delete
            if kwargs['delete']:
                DocumentSimilarity.objects.all().delete()
            self.push()

            # step #2 - prepare data
            texts_set = ['\n'.join(d.textunit_set.values_list('textunittext__text', flat=True))
                         for d in documents]
            self.push()

            # step #3 - build TF-IDF vectors over the document texts
            vectorizer = TfidfVectorizer(max_df=0.5, max_features=self.n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=kwargs['use_idf'])
            X = vectorizer.fit_transform(texts_set)
            self.push()

            # step #4 - pairwise cosine similarity, scaled to percent
            similarity_matrix = cosine_similarity(X) * 100
            pks = documents.values_list('pk', flat=True)
            for x in range(len(pks) - 1):
                document_a = pks[x]
                # y starts at x + 1 so each unique a<>b pair is stored once
                for y in range(x + 1, len(pks)):
                    document_b = pks[y]
                    similarity = similarity_matrix[x, y]
                    if similarity < similarity_threshold:
                        continue
                    DocumentSimilarity.objects.create(
                        document_a_id=document_a,
                        document_b_id=document_b,
                        similarity=similarity)
            self.push()

        # similar Text Units
        if search_similar_text_units:

            # step #1 - delete
            if kwargs['delete']:
                TextUnitSimilarity.objects.all().delete()
            self.push()

            # step #2 - prepare data
            texts_set, pks = zip(*text_units.values_list('textunittext__text', 'pk'))
            self.push()

            # step #3 - build TF-IDF vectors over the text unit texts
            vectorizer = TfidfVectorizer(tokenizer=normalize,
                                         max_df=0.5, max_features=self.n_features,
                                         min_df=2, stop_words='english',
                                         use_idf=kwargs['use_idf'])
            X = vectorizer.fit_transform(texts_set)
            self.push()

            # step #4 - score the pairs block by block so that only
            # a step x step slice of the full matrix is held in memory
            for i in range(0, len_tu_set, self.step):
                for j in range(i + 1, len_tu_set, self.step):
                    similarity_matrix = cosine_similarity(
                        X[i:min(i + self.step, len_tu_set)],
                        X[j:min(j + self.step, len_tu_set)]) * 100
                    for g in range(similarity_matrix.shape[0]):
                        tu_sim = [
                            TextUnitSimilarity(
                                text_unit_a_id=pks[i + g],
                                text_unit_b_id=pks[j + h],
                                similarity=similarity_matrix[g, h])
                            for h in range(similarity_matrix.shape[1])
                            if i + g != j + h and similarity_matrix[g, h] >= similarity_threshold]
                        # resolve unit -> document -> project references,
                        # then insert this block's pairs in one query
                        TextUnitSimilarity.fill_joined_refs(tu_sim)
                        TextUnitSimilarity.objects.bulk_create(tu_sim)
                    self.push()
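
Step #4 above never materializes the full n x n similarity matrix: it scores one step x step block at a time, so peak memory grows with step ** 2 rather than with the total number of text unit pairs. A self-contained sketch of the same blocking technique on toy data (names and values here are illustrative, not project code):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    texts = ['alpha beta gamma', 'alpha beta delta',
             'epsilon zeta eta', 'alpha gamma delta']
    X = TfidfVectorizer().fit_transform(texts)
    n, step, threshold = X.shape[0], 2, 50.0
    pairs = []
    for i in range(0, n, step):
        for j in range(i + 1, n, step):
            # compare one row block against one column block;
            # the resulting matrix is at most step x step
            block = cosine_similarity(X[i:i + step], X[j:j + step]) * 100
            for g in range(block.shape[0]):
                for h in range(block.shape[1]):
                    # skip self-pairs and pairs below the threshold
                    if i + g != j + h and block[g, h] >= threshold:
                        pairs.append((i + g, j + h, block[g, h]))
    print(pairs)  # [(index_a, index_b, similarity_percent), ...]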