def store_unit_similarity_issues(self,
                                 un_sims: List[TextUnitSimilarity],
                                 flush: bool = False) -> None:
    """
    Buffer TextUnitSimilarity objects to save them later in one bulk insert operation
    :param un_sims: items to store
    :param flush: force flushing the buffer regardless of its size
    """
    if un_sims:
        self.unsim_store_buffer += un_sims
    if len(self.unsim_store_buffer) < self.store_buf_flush_count and not flush:
        return
    if self.unsim_store_buffer:
        # fill joined references: unit -> document -> project
        TextUnitSimilarity.fill_joined_refs(self.unsim_store_buffer)
        TextUnitSimilarity.objects.bulk_create(self.unsim_store_buffer,
                                               ignore_conflicts=True)
        self.unsim_store_buffer = []
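
# A minimal usage sketch of the buffering API above, assuming ``task`` is an
# instance of the surrounding task class; ``_demo_buffered_store`` and its
# arguments are hypothetical names used for illustration only. Callers push
# batches through the buffer and force one final flush so the tail of the
# buffer is not lost.
def _demo_buffered_store(task, pair_batches: List[List[TextUnitSimilarity]]) -> None:
    for batch in pair_batches:
        # buffered: actual DB writes happen once store_buf_flush_count is reached
        task.store_unit_similarity_issues(batch)
    # final flush writes whatever is still sitting in the buffer
    task.store_unit_similarity_issues([], flush=True)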
def process(self, **kwargs):
    """
    Find similar Documents and/or TextUnits via TF-IDF vectors and cosine similarity.
    :param kwargs: task options: search flags, similarity_threshold, project, delete, use_idf
    :return: None
    """
    search_similar_documents = kwargs['search_similar_documents']
    search_similar_text_units = kwargs['search_similar_text_units']
    similarity_threshold = kwargs['similarity_threshold']
    project = kwargs.get('project')
    project_id = project['pk'] if project else 0

    self.log_info('Min similarity: %d' % similarity_threshold)

    # get text units at least 100 characters long
    filters = dict(unit_type='paragraph', textunittext__text__regex=r'.{100}.*')
    if project_id:
        filters['project_id'] = project_id
    text_units = TextUnit.objects.filter(**filters)
    len_tu_set = text_units.count()

    push_steps = 0
    if search_similar_documents:
        push_steps += 4
    if search_similar_text_units:
        # upper bound: one push per block pair plus the three preparation steps
        push_steps += math.ceil(len_tu_set / self.step) ** 2 + 3
    self.set_push_steps(push_steps)

    documents = Document.objects.filter(project_id=project_id) if project_id \
        else Document.objects.all()

    # similar Documents
    if search_similar_documents:

        # step #1 - delete
        if kwargs['delete']:
            DocumentSimilarity.objects.all().delete()
        self.push()

        # step #2 - prepare data
        texts_set = ['\n'.join(d.textunit_set.values_list('textunittext__text', flat=True))
                     for d in documents]
        self.push()

        # step #3
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=self.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=kwargs['use_idf'])
        X = vectorizer.fit_transform(texts_set)
        self.push()

        # step #4
        similarity_matrix = cosine_similarity(X) * 100
        # materialize pks once: indexing a queryset would issue a query per access
        pks = list(documents.values_list('pk', flat=True))
        for x in range(len(pks) - 1):
            document_a = pks[x]
            # keep y > x to store only unique a<>b relations
            for y in range(x + 1, len(pks)):
                document_b = pks[y]
                similarity = similarity_matrix[x, y]
                if similarity < similarity_threshold:
                    continue
                DocumentSimilarity.objects.create(
                    document_a_id=document_a,
                    document_b_id=document_b,
                    similarity=similarity)
        self.push()

    # similar Text Units
    if search_similar_text_units:

        # step #1 - delete
        if kwargs['delete']:
            TextUnitSimilarity.objects.all().delete()
        self.push()

        # step #2 - prepare data
        texts_set, pks = zip(*text_units.values_list('textunittext__text', 'pk'))
        self.push()

        # step #3
        vectorizer = TfidfVectorizer(tokenizer=normalize,
                                     max_df=0.5, max_features=self.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=kwargs['use_idf'])
        X = vectorizer.fit_transform(texts_set)
        self.push()

        # step #4 - compare row blocks of X so each pairwise matrix stays
        # at most step x step; the i + g != j + h check skips self-pairs
        for i in range(0, len_tu_set, self.step):
            for j in range(i + 1, len_tu_set, self.step):
                similarity_matrix = cosine_similarity(
                    X[i:min([i + self.step, len_tu_set])],
                    X[j:min([j + self.step, len_tu_set])]) * 100
                for g in range(similarity_matrix.shape[0]):
                    tu_sim = [
                        TextUnitSimilarity(
                            text_unit_a_id=pks[i + g],
                            text_unit_b_id=pks[j + h],
                            similarity=similarity_matrix[g, h])
                        for h in range(similarity_matrix.shape[1])
                        if i + g != j + h and similarity_matrix[g, h] >= similarity_threshold]
                    TextUnitSimilarity.fill_joined_refs(tu_sim)
                    TextUnitSimilarity.objects.bulk_create(tu_sim)
                self.push()
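
# A self-contained sketch of the blocked similarity pass in step #4 above:
# comparing row blocks of the TF-IDF matrix keeps each pairwise matrix at
# ``step`` x ``step`` instead of n x n. ``_demo_blocked_cosine`` and its
# parameters are illustrative assumptions, not part of the task API; only
# the sklearn calls mirror the code above.
def _demo_blocked_cosine(texts: List[str], step: int = 1000,
                         threshold: float = 75.0) -> List[tuple]:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    X = TfidfVectorizer(stop_words='english').fit_transform(texts)
    n = X.shape[0]
    pairs = []
    for i in range(0, n, step):
        for j in range(0, n, step):
            # sparse row slicing clamps at n, so no explicit bounds check is needed
            block = cosine_similarity(X[i:i + step], X[j:j + step]) * 100
            for g in range(block.shape[0]):
                for h in range(block.shape[1]):
                    if i + g != j + h and block[g, h] >= threshold:
                        pairs.append((i + g, j + h, block[g, h]))
    return pairs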