def build_doc2vec_model(self) -> None:
    """
    Train a doc2vec model over text units and store the resulting
    transformer on ``self.transformer``.

    Side effects:
        - normalizes ``self.project_id`` into ``self.project_ids`` or, as a
          fallback, derives project ids from the distinct projects of
          ``self.queryset``;
        - fills ``self.queryset`` with TextUnitText rows of the selected
          projects when it was empty.

    :raises RuntimeError: when neither a project id nor a non-empty docs
        queryset is available.
    """
    transformer = Doc2VecTransformer(vector_size=100, window=10,
                                     min_count=10, dm=1)
    transformer_name = ''
    # FIX: self.project_id may be either a single int or a list of ids;
    # the old code wrapped a list in another list, producing an invalid
    # project_id__in filter. Same normalization as the document variant.
    self.project_ids = [self.project_id] \
        if self.project_id and isinstance(self.project_id, int) \
        else self.project_id or []
    if not self.project_ids:
        self.project_ids = list(
            self.queryset.values_list('project_id', flat=True).distinct())
    if not self.project_ids:
        error_msg = 'Document2VecFeatures has got no project_id and empty docs queryset'
        self.log_message(error_msg)
        raise RuntimeError(error_msg)
    if not self.queryset:
        self.queryset = \
            TextUnitText.objects.filter(text_unit__unit_type=self.unit_type,
                                        text_unit__document__project_id__in=self.project_ids)
    model_builder_args = dict(project_ids=self.project_ids,
                              transformer_name=transformer_name)
    model_builder = transformer.build_doc2vec_text_unit_model
    doc2vec, trans_obj = model_builder(**model_builder_args)
    self.transformer = trans_obj
def get_features(self) -> Features:
    """
    Aggregator method to transform incoming queryset into features and indexes
    """
    self.build_doc2vec_model()
    data = self.queryset.values_list('text_unit_id', 'text')
    # type: gensim.models.doc2vec.Doc2Vec
    vectors = Doc2VecTransformer.create_vectors(
        self.transformer, data, TextUnitVector,
        'text_unit_id')  # type: List[TextUnitVector]
    # human-readable row labels: the text unit's character span
    item_names = [
        f'[{v.text_unit.location_start}:{v.text_unit.location_end}]'
        for v in vectors]
    unqualified_item_ids = []
    unqualified_item_names = []
    # feature names could be words instead of just "f0" ... by tagging documents
    # but this would require too much memory
    feature_count = len(vectors[0].vector_value)
    columns = ['id'] + [f'f{i}' for i in range(feature_count)]
    rows = [[v.text_unit.pk] + list(v.vector_value) for v in vectors]
    feature_df = pd.DataFrame(rows, columns=columns)
    feature_df.set_index('id', inplace=True)
    return Features(feature_df, item_names,
                    unqualified_item_ids, unqualified_item_names)
def build_doc2vec_model(self) -> None:
    """
    Train a doc2vec model over documents of the selected projects and
    store the resulting transformer on ``self.transformer``.

    :raises RuntimeError: when neither a project id nor a non-empty docs
        queryset is available.
    """
    transformer = Doc2VecTransformer(vector_size=100, window=10,
                                     min_count=10, dm=1)
    transformer_name = ''
    # self.project_id may be a single int or already a list of ids
    if self.project_id and isinstance(self.project_id, int):
        self.project_ids = [self.project_id]
    else:
        self.project_ids = self.project_id or []
    if not self.project_ids:
        self.project_ids = list(
            self.queryset.values_list('project_id', flat=True).distinct())
    if not self.project_ids:
        error_msg = 'Document2VecFeatures has got no project_id and empty docs queryset'
        self.log_message(error_msg)
        raise RuntimeError(error_msg)
    if not self.queryset:
        self.queryset = Document.objects.filter(
            project_id__in=self.project_ids)
    # source == 'document':
    _doc2vec, trans_obj = transformer.build_doc2vec_document_model(
        project_ids=self.project_ids, transformer_name=transformer_name)
    self.transformer = trans_obj
def get_features(self) -> Features:
    """
    Aggregator method to transform incoming queryset into features and indexes
    """
    self.build_doc2vec_model()
    data = DocumentText.objects.filter(
        document__project_id__in=self.project_ids).values_list(
            'document_id', 'full_text')
    # type: gensim.models.doc2vec.Doc2Vec
    vectors = Doc2VecTransformer.create_vectors(
        self.transformer, data, DocumentVector,
        'document_id')  # type: List[DocumentVector]
    # human-readable row labels: the document names
    item_names = [v.document.name for v in vectors]
    unqualified_item_ids = []
    unqualified_item_names = []
    # feature names could be words instead of just "f0" ... by tagging documents
    # but this would require too much memory
    feature_count = len(vectors[0].vector_value)
    columns = ['id'] + [f'f{i}' for i in range(feature_count)]
    rows = [[v.document.pk] + list(v.vector_value) for v in vectors]
    feature_df = pd.DataFrame(rows, columns=columns)
    feature_df.set_index('id', inplace=True)
    return Features(feature_df, item_names,
                    unqualified_item_ids, unqualified_item_names)
def process(self, **kwargs):
    """
    Train a doc2vec model from either documents or text units.

    Expected kwargs: source ('document' or anything else for text units),
    transformer_name, project_ids, vector_size, window, min_count, dm, and
    (text units only) text_unit_type.
    """
    source = kwargs.get('source')
    self.log_info('Going to train doc2vec model from {} objects...'.format(
        source.upper()))
    transformer = Doc2VecTransformer(vector_size=kwargs.get('vector_size'),
                                     window=kwargs.get('window'),
                                     min_count=kwargs.get('min_count'),
                                     dm=kwargs.get('dm'))
    builder_kwargs = {'project_ids': kwargs.get('project_ids'),
                      'transformer_name': kwargs.get('transformer_name')}
    if source == 'document':
        builder = transformer.build_doc2vec_document_model
    else:
        builder = transformer.build_doc2vec_text_unit_model
        builder_kwargs['text_unit_type'] = kwargs.get('text_unit_type')
    builder(**builder_kwargs)
def get_vectors(self) -> List[DocumentVector]:
    """
    Return document vectors for the current queryset.

    When ``feature_source == 'vector'``: compute and save vectors for the
    documents that do not have one yet for ``self.transformer``, then return
    all stored vectors of that transformer for the queryset.
    Otherwise (``feature_source == 'text'``): train a fresh model and build
    vectors in memory without saving.
    """
    qs = self.get_queryset()  # type: Document.objects
    if self.feature_source == 'vector':
        # NOTE: a direct qs.exclude(documentvector__transformer=...) join
        # hangs forever on large datasets (see the text-unit variant of this
        # method), so find the documents lacking vectors via an id subquery.
        docs_with_vectors = DocumentVector.objects \
            .filter(transformer=self.transformer, document__in=qs) \
            .values_list('document_id', flat=True)
        docs_wo_vectors = qs.exclude(id__in=docs_with_vectors)
        if docs_wo_vectors.exists():
            data = DocumentText.objects \
                .filter(document__in=docs_wo_vectors) \
                .values_list('document_id', 'full_text')
            Doc2VecTransformer.create_vectors(
                self.transformer, data, DocumentVector, 'document_id', save=True)
        return list(DocumentVector.objects.filter(document__in=qs,
                                                  transformer=self.transformer))
    # self.feature_source == 'text'
    transformer = self.build_doc2vec_model()
    data = self.get_document_data(qs)
    return Doc2VecTransformer.create_vectors(transformer, data,
                                             DocumentVector, 'document_id')
def get_vectors(self) -> List[TextUnitVector]:
    """
    Return text unit vectors for the current queryset.

    When ``feature_source == 'vector'``: compute and save vectors for the
    text units that do not have one yet for ``self.transformer``, then
    return all stored vectors of that transformer for the queryset.
    Otherwise (``feature_source == 'text'``): train a fresh model and build
    vectors in memory without saving.
    """
    qs = self.get_queryset()  # type: TextUnit.objects
    if self.feature_source == 'vector':
        # this hangs forever
        # tu_wo_vectors = qs.exclude(textunitvector__transformer=self.transformer)
        tu_with_vectors = TextUnitVector.objects \
            .filter(transformer=self.transformer, text_unit__in=qs) \
            .values_list('text_unit_id', flat=True)
        tu_wo_vectors = qs.exclude(id__in=tu_with_vectors)
        if tu_wo_vectors.exists():
            data = TextUnitText.objects \
                .filter(text_unit__in=tu_wo_vectors) \
                .values_list('text_unit_id', 'text')
            Doc2VecTransformer.create_vectors(
                self.transformer, data, TextUnitVector, 'text_unit_id', save=True)
        # FIX: filter by transformer as well — the old code returned vectors
        # produced by *other* transformers for the same text units; the
        # document-based variant already filters this way.
        return list(TextUnitVector.objects.filter(text_unit__in=qs,
                                                  transformer=self.transformer))
    # self.feature_source == 'text'
    transformer = self.build_doc2vec_model()
    data = TextUnitText.objects.filter(text_unit__in=qs).values_list('text_unit_id', 'text')
    return Doc2VecTransformer.create_vectors(transformer, data, TextUnitVector, 'text_unit_id')
def save_feature_vectors(self):
    """
    Build doc2vec vectors for all documents or text units in scope and
    bulk-save them to the matching vector table.

    Reads from self: ``source`` ('document' or text unit), ``transformer``
    (optional, falls back to loading by ``model_id``), ``project_ids``
    (optional project filter), ``text_unit_type`` (text unit source only)
    and ``delete_existing`` (drop previously stored vectors first).

    Raises: re-raises any DB error from the delete or bulk_create steps
    after logging it.
    """
    # pick the model/vector classes matching the configured source
    model_class = DocumentTransformer if self.source == 'document' else TextUnitTransformer
    vector_class = DocumentVector if self.source == 'document' else TextUnitVector
    # prefer an in-memory transformer; otherwise load it from the DB by pk
    transformer = (self.transformer if hasattr(self, 'transformer') else None) or \
        model_class.objects.get(pk=self.model_id)
    id_field = 'document_id' if self.source == 'document' else 'text_unit_id'
    # build the (id, text) source data query, optionally narrowed by project
    if self.source == 'document':
        data_query = DocumentText.objects.all()
        if self.project_ids:
            data_query = data_query.filter(
                document__project_id__in=self.project_ids)
        data = data_query.values_list(id_field, 'full_text')
    else:
        data_query = TextUnitText.objects.filter(
            text_unit__unit_type=self.text_unit_type)
        if self.project_ids:
            data_query = data_query.filter(
                text_unit__document__project_id__in=self.project_ids)
        data = data_query.values_list(id_field, 'text')
    if self.delete_existing:
        # delete only the vectors belonging to the ids we are about to rebuild
        # NOTE(review): this deletes vectors of ALL transformers for those ids,
        # not just the current one — confirm that is intended
        data_ids = data_query.values_list(id_field, flat=True)
        delete_query = vector_class.objects.all()
        if self.source == 'document':
            delete_query = delete_query.filter(document_id__in=data_ids)
        else:
            delete_query = delete_query.filter(text_unit_id__in=data_ids)
        try:
            delete_query.delete()
        except Exception as e:
            self.log_error('Error deleting existing records', exc_info=e)
            raise
    vectors = Doc2VecTransformer.create_vectors(
        transformer, data, vector_class, id_field)  # type: List[BaseVector]
    # save vectors
    self.log_info(f'Saving {len(vectors)} vectors')
    if not vectors:
        return
    try:
        # ignore_conflicts: rows that already exist are silently skipped
        vector_class.objects.bulk_create(vectors, ignore_conflicts=True)
    except Exception as e:
        self.log_error(f'Error storing {vector_class.__name__}', exc_info=e)
        raise
def process(self, **kwargs):
    """
    Train a doc2vec model from documents or text units, store the resulting
    transformer on ``self.transformer``, and optionally persist feature
    vectors for the trained model.

    :raises RuntimeError: when a transformer with the requested name
        already exists.
    """
    self.source = kwargs.get('source')
    transformer_class = (DocumentTransformer if self.source == 'document'
                         else TextUnitTransformer)
    self.log_info(
        f'Training doc2vec model from {self.source.upper()} objects...')
    transformer_name = kwargs.get('transformer_name')
    # refuse to overwrite an existing transformer of the same name
    if transformer_class.objects.filter(name=transformer_name).count() > 0:
        raise RuntimeError(
            f"There's already {transformer_class.__name__} with name '{transformer_name}'"
        )
    self.project_ids = kwargs.get('project_ids')
    self.text_unit_type = kwargs.get(
        'text_unit_type') or self.text_unit_type
    build_vectors = kwargs.get('build_vectors')
    d2v = Doc2VecTransformer(vector_size=kwargs.get('vector_size'),
                             window=kwargs.get('window'),
                             min_count=kwargs.get('min_count'),
                             dm=kwargs.get('dm'))
    builder_kwargs = dict(project_ids=self.project_ids,
                          transformer_name=transformer_name)
    if self.source == 'document':
        builder = d2v.build_doc2vec_document_model
    else:
        builder = d2v.build_doc2vec_text_unit_model
        builder_kwargs['text_unit_type'] = self.text_unit_type
    # builder returns (gensim.models.doc2vec.Doc2Vec, BaseTransformer)
    _, self.transformer = builder(**builder_kwargs)
    if build_vectors:
        self.save_feature_vectors()
def build_doc2vec_model(self) -> MLModel:
    """
    Train a doc2vec model over the text units of the current queryset and
    return the resulting transformer object.
    """
    d2v = Doc2VecTransformer(vector_size=100, window=10, min_count=10, dm=1)
    unit_qs = self.get_queryset()  # type: TextUnit.objects
    _model, trans_obj = d2v.build_doc2vec_text_unit_model(text_unit_qs=unit_qs)
    return trans_obj
def build_doc2vec_model(self) -> MLModel:
    """
    Train a doc2vec model over the documents of the current queryset and
    return the resulting transformer object.
    """
    d2v = Doc2VecTransformer(vector_size=100, window=10, min_count=10, dm=1,
                             file_storage=self.file_storage)
    doc_qs = self.get_queryset()  # type: Document.objects
    _model, trans_obj = d2v.build_doc2vec_document_model(document_qs=doc_qs)
    return trans_obj