Beispiel #1
0
    def build_doc2vec_model(self) -> None:
        """
        Train a doc2vec model on text units and keep the resulting transformer.

        Project ids come from ``self.project_id`` or, when it is not set, from
        the distinct ``project_id`` values of ``self.queryset``. If
        ``self.queryset`` is falsy it is replaced with the ``TextUnitText``
        rows of type ``self.unit_type`` belonging to those projects.
        The transformer object returned by the model builder is stored in
        ``self.transformer``; nothing is returned.

        :raises RuntimeError: if no project id can be determined
        """
        transformer = Doc2VecTransformer(vector_size=100,
                                         window=10,
                                         min_count=10,
                                         dm=1)
        transformer_name = ''
        self.project_ids = [self.project_id] if self.project_id else []
        if not self.project_ids:
            # The "if not self.queryset" check further down suggests the
            # queryset may be unset; guard the lookup so that case reaches the
            # descriptive RuntimeError instead of failing with AttributeError.
            if self.queryset is not None:
                self.project_ids = list(
                    self.queryset.values_list('project_id', flat=True).distinct())
            if not self.project_ids:
                error_msg = 'Document2VecFeatures has got no project_id and empty docs queryset'
                self.log_message(error_msg)
                raise RuntimeError(error_msg)

        if not self.queryset:
            self.queryset = \
                TextUnitText.objects.filter(text_unit__unit_type=self.unit_type,
                                            text_unit__document__project_id__in=self.project_ids)

        model_builder_args = dict(project_ids=self.project_ids,
                                  transformer_name=transformer_name)
        model_builder = transformer.build_doc2vec_text_unit_model
        # The builder returns a pair; only the second item (the transformer
        # object) is needed here.
        _, trans_obj = model_builder(**model_builder_args)
        self.transformer = trans_obj
Beispiel #2
0
    def get_features(self) -> Features:
        """
        Aggregator method to transform incoming queryset into features and indexes.

        Builds the doc2vec model, creates one vector per
        ``(text_unit_id, text)`` row of ``self.queryset`` and packs the vectors
        into a DataFrame indexed by text unit pk with columns ``f0``, ``f1``, ...
        Item names have the form ``[location_start:location_end]``.

        :return: Features built from the DataFrame and the item names; the
            "unqualified" id/name lists are always empty here. When no vectors
            are produced the DataFrame is empty instead of raising IndexError.
        """
        # build_doc2vec_model() returns None; it stores the transformer on self
        self.build_doc2vec_model()
        data = self.queryset.values_list('text_unit_id', 'text')
        vectors = Doc2VecTransformer.create_vectors(
            self.transformer, data, TextUnitVector,
            'text_unit_id')  # type: List[TextUnitVector]

        item_names = [
            f'[{v.text_unit.location_start}:{v.text_unit.location_end}]'
            for v in vectors
        ]
        unqualified_item_ids = []
        unqualified_item_names = []

        # feature names could be words instead of just "f0" ... by tagging documents
        # but this would require too much memory
        # The vector width is read from the first vector; with no vectors at
        # all there are simply no feature columns.
        vector_size = len(vectors[0].vector_value) if vectors else 0
        columns = ['id'] + [f'f{i}' for i in range(vector_size)]
        vectors_indexed = [[v.text_unit.pk] + list(v.vector_value)
                           for v in vectors]
        feature_df = pd.DataFrame(vectors_indexed, columns=columns)
        feature_df.set_index('id', inplace=True)

        res = Features(feature_df, item_names, unqualified_item_ids,
                       unqualified_item_names)

        return res
Beispiel #3
0
    def build_doc2vec_model(self) -> None:
        """
        Train a doc2vec model on documents and keep the resulting transformer.

        ``self.project_id`` may be a single int (wrapped into a list) or any
        other truthy value, which is used as-is as the list of project ids.
        When it is not set, the ids are taken from the distinct ``project_id``
        values of ``self.queryset``. If ``self.queryset`` is falsy it is
        replaced with the ``Document`` rows of those projects.
        The transformer object returned by the model builder is stored in
        ``self.transformer``; nothing is returned.

        :raises RuntimeError: if no project id can be determined
        """
        transformer = Doc2VecTransformer(vector_size=100,
                                         window=10,
                                         min_count=10,
                                         dm=1)
        transformer_name = ''
        self.project_ids = [self.project_id] \
            if self.project_id and isinstance(self.project_id, int) else self.project_id or []
        if not self.project_ids:
            # The "if not self.queryset" check further down suggests the
            # queryset may be unset; guard the lookup so that case reaches the
            # descriptive RuntimeError instead of failing with AttributeError.
            if self.queryset is not None:
                self.project_ids = list(
                    self.queryset.values_list('project_id', flat=True).distinct())
            if not self.project_ids:
                error_msg = 'Document2VecFeatures has got no project_id and empty docs queryset'
                self.log_message(error_msg)
                raise RuntimeError(error_msg)

        if not self.queryset:
            self.queryset = Document.objects.filter(
                project_id__in=self.project_ids)

        model_builder_args = dict(project_ids=self.project_ids,
                                  transformer_name=transformer_name)
        model_builder = transformer.build_doc2vec_document_model  # source == 'document':
        # The builder returns a pair; only the second item (the transformer
        # object) is needed here.
        _, trans_obj = model_builder(**model_builder_args)
        self.transformer = trans_obj
Beispiel #4
0
    def get_features(self) -> Features:
        """
        Aggregator method to transform incoming queryset into features and indexes.

        Builds the doc2vec model, creates one vector per
        ``(document_id, full_text)`` row of ``DocumentText`` for the projects
        in ``self.project_ids`` and packs the vectors into a DataFrame indexed
        by document pk with columns ``f0``, ``f1``, ... Item names are the
        document names.

        :return: Features built from the DataFrame and the item names; the
            "unqualified" id/name lists are always empty here. When no vectors
            are produced the DataFrame is empty instead of raising IndexError.
        """
        # build_doc2vec_model() returns None; it stores the transformer and
        # the project ids on self
        self.build_doc2vec_model()
        data = DocumentText.objects.filter(
            document__project_id__in=self.project_ids).values_list(
                'document_id', 'full_text')
        vectors = Doc2VecTransformer.create_vectors(
            self.transformer, data, DocumentVector,
            'document_id')  # type: List[DocumentVector]

        item_names = [v.document.name for v in vectors]
        unqualified_item_ids = []
        unqualified_item_names = []

        # feature names could be words instead of just "f0" ... by tagging documents
        # but this would require too much memory
        # The vector width is read from the first vector; with no vectors at
        # all there are simply no feature columns.
        vector_size = len(vectors[0].vector_value) if vectors else 0
        columns = ['id'] + [f'f{i}' for i in range(vector_size)]
        vectors_indexed = [[v.document.pk] + list(v.vector_value)
                           for v in vectors]
        feature_df = pd.DataFrame(vectors_indexed, columns=columns)
        feature_df.set_index('id', inplace=True)

        res = Features(feature_df, item_names, unqualified_item_ids,
                       unqualified_item_names)

        return res
Beispiel #5
0
    def process(self, **kwargs):
        """
        Train a doc2vec model from either documents or text units.

        Recognized keyword arguments: ``source`` ('document' selects the
        document model builder, anything else the text unit one),
        ``transformer_name``, ``project_ids``, ``vector_size``, ``window``,
        ``min_count``, ``dm`` and, for text units only, ``text_unit_type``.
        """
        source = kwargs.get('source')

        self.log_info('Going to train doc2vec model from {} objects...'.format(
            source.upper()))

        # hyper-parameters are forwarded to the transformer unchanged
        hyper_params = {
            key: kwargs.get(key)
            for key in ('vector_size', 'window', 'min_count', 'dm')
        }
        transformer = Doc2VecTransformer(**hyper_params)

        builder_kwargs = {
            'project_ids': kwargs.get('project_ids'),
            'transformer_name': kwargs.get('transformer_name'),
        }
        if source != 'document':
            # only the text unit builder takes the unit type
            builder_kwargs['text_unit_type'] = kwargs.get('text_unit_type')
            build = transformer.build_doc2vec_text_unit_model
        else:
            build = transformer.build_doc2vec_document_model

        build(**builder_kwargs)
Beispiel #6
0
    def get_vectors(self) -> List[DocumentVector]:
        """
        Return document vectors for the documents of ``self.get_queryset()``.

        With ``feature_source == 'vector'`` the vectors stored for
        ``self.transformer`` are returned; for documents that have none,
        vectors are first created with ``save=True`` (presumably persisting
        them - confirm in ``Doc2VecTransformer.create_vectors``).
        For any other feature source a model is built via
        ``self.build_doc2vec_model()`` and vectors are created from
        ``self.get_document_data``.
        """
        documents = self.get_queryset()    # type: Document.objects

        if self.feature_source != 'vector':
            # self.feature_source == 'text'
            trained = self.build_doc2vec_model()
            doc_data = self.get_document_data(documents)
            return Doc2VecTransformer.create_vectors(
                trained, doc_data, DocumentVector, 'document_id')

        missing = documents.exclude(documentvector__transformer=self.transformer)
        if missing.exists():
            texts = DocumentText.objects.filter(document__in=missing)
            Doc2VecTransformer.create_vectors(
                self.transformer,
                texts.values_list('document_id', 'full_text'),
                DocumentVector,
                'document_id',
                save=True)

        stored = DocumentVector.objects.filter(
            document__in=documents, transformer=self.transformer)
        return list(stored)
Beispiel #7
0
    def get_vectors(self) -> List[TextUnitVector]:
        """
        Return text unit vectors for the text units of ``self.get_queryset()``.

        With ``feature_source == 'vector'`` the vectors stored for
        ``self.transformer`` are returned; for text units that have none,
        vectors are first created with ``save=True`` (presumably persisting
        them - confirm in ``Doc2VecTransformer.create_vectors``).
        For any other feature source a model is built via
        ``self.build_doc2vec_model()`` and vectors are created from the
        ``TextUnitText`` rows of the queryset.
        """
        qs = self.get_queryset()    # type: TextUnit.objects

        if self.feature_source == 'vector':
            # this hangs forever
            # tu_wo_vectors = qs.exclude(textunitvector__transformer=self.transformer)
            tu_with_vectors = TextUnitVector.objects \
                .filter(transformer=self.transformer, text_unit__in=qs) \
                .values_list('text_unit_id', flat=True)
            tu_wo_vectors = qs.exclude(id__in=tu_with_vectors)
            if tu_wo_vectors.exists():
                data = TextUnitText.objects \
                    .filter(text_unit__in=tu_wo_vectors) \
                    .values_list('text_unit_id', 'text')
                Doc2VecTransformer.create_vectors(
                    self.transformer, data, TextUnitVector, 'text_unit_id', save=True)
            # Restrict to this transformer: without the filter, vectors made
            # by other transformers for the same text units would be returned
            # too (the document variant of this method filters the same way).
            return list(TextUnitVector.objects.filter(
                text_unit__in=qs, transformer=self.transformer))

        # self.feature_source == 'text'
        transformer = self.build_doc2vec_model()
        data = TextUnitText.objects.filter(text_unit__in=qs).values_list('text_unit_id', 'text')
        return Doc2VecTransformer.create_vectors(transformer, data, TextUnitVector, 'text_unit_id')
    def save_feature_vectors(self):
        """
        Create vectors for documents or text units and bulk-save them.

        ``self.source`` selects the models: 'document' works on
        ``DocumentText`` / ``DocumentVector``, anything else on
        ``TextUnitText`` (restricted to ``self.text_unit_type``) /
        ``TextUnitVector``. The source rows are limited to
        ``self.project_ids`` when that is set. The transformer is
        ``self.transformer`` if present and truthy, otherwise it is loaded by
        ``self.model_id``. With ``self.delete_existing`` the vectors already
        stored for the selected rows are deleted first. Errors while deleting
        or storing are logged and re-raised.
        """
        if self.source == 'document':
            model_class, vector_class = DocumentTransformer, DocumentVector
            id_field, text_field = 'document_id', 'full_text'
            project_lookup = 'document__project_id__in'
        else:
            model_class, vector_class = TextUnitTransformer, TextUnitVector
            id_field, text_field = 'text_unit_id', 'text'
            project_lookup = 'text_unit__document__project_id__in'

        transformer = getattr(self, 'transformer', None) or \
            model_class.objects.get(pk=self.model_id)

        if self.source == 'document':
            data_query = DocumentText.objects.all()
        else:
            data_query = TextUnitText.objects.filter(
                text_unit__unit_type=self.text_unit_type)
        if self.project_ids:
            data_query = data_query.filter(**{project_lookup: self.project_ids})
        data = data_query.values_list(id_field, text_field)

        if self.delete_existing:
            # drop the vectors stored for exactly the rows being processed
            stale_ids = data_query.values_list(id_field, flat=True)
            stale_vectors = vector_class.objects.filter(
                **{f'{id_field}__in': stale_ids})
            try:
                stale_vectors.delete()
            except Exception as e:
                self.log_error('Error deleting existing records', exc_info=e)
                raise

        vectors = Doc2VecTransformer.create_vectors(
            transformer, data, vector_class,
            id_field)  # type: List[BaseVector]
        # save vectors
        self.log_info(f'Saving {len(vectors)} vectors')
        if vectors:
            try:
                vector_class.objects.bulk_create(vectors, ignore_conflicts=True)
            except Exception as e:
                self.log_error(f'Error storing {vector_class.__name__}',
                               exc_info=e)
                raise
    def process(self, **kwargs):
        """
        Train a named doc2vec model and optionally build feature vectors.

        Recognized keyword arguments: ``source`` ('document' selects the
        document models/builder, anything else the text unit ones),
        ``transformer_name``, ``project_ids``, ``vector_size``, ``window``,
        ``min_count``, ``dm``, ``build_vectors`` and ``text_unit_type``
        (falls back to the current ``self.text_unit_type``).
        The trained transformer is stored in ``self.transformer``; when
        ``build_vectors`` is truthy ``self.save_feature_vectors()`` is called.

        :raises RuntimeError: if a transformer with the given name already exists
        """
        self.source = kwargs.get('source')
        transformer_class = DocumentTransformer if self.source == 'document' else TextUnitTransformer
        self.log_info(
            f'Training doc2vec model from {self.source.upper()} objects...')
        transformer_name = kwargs.get('transformer_name')
        # exists() answers the yes/no question without counting every match
        if transformer_class.objects.filter(name=transformer_name).exists():
            raise RuntimeError(
                f"There's already {transformer_class.__name__} with name '{transformer_name}'"
            )

        self.project_ids = kwargs.get('project_ids')
        vector_size = kwargs.get('vector_size')
        window = kwargs.get('window')
        min_count = kwargs.get('min_count')
        dm = kwargs.get('dm')
        build_vectors = kwargs.get('build_vectors')
        self.text_unit_type = kwargs.get(
            'text_unit_type') or self.text_unit_type

        transformer = Doc2VecTransformer(vector_size=vector_size,
                                         window=window,
                                         min_count=min_count,
                                         dm=dm)

        model_builder_args = dict(project_ids=self.project_ids,
                                  transformer_name=transformer_name)
        if self.source == 'document':
            model_builder = transformer.build_doc2vec_document_model
        else:
            model_builder = transformer.build_doc2vec_text_unit_model
            # only the text unit builder takes the unit type
            model_builder_args['text_unit_type'] = self.text_unit_type

        _, transformer = model_builder(
            **model_builder_args
        )  # gensim.models.doc2vec.Doc2Vec, BaseTransformer
        self.transformer = transformer
        if build_vectors:
            self.save_feature_vectors()
Beispiel #10
0
 def build_doc2vec_model(self) -> MLModel:
     """
     Train a doc2vec model on the text units of ``self.get_queryset()``.

     :return: the second item of the pair returned by
         ``build_doc2vec_text_unit_model`` (the transformer object)
     """
     hyper_params = dict(vector_size=100, window=10, min_count=10, dm=1)
     builder = Doc2VecTransformer(**hyper_params)
     text_units = self.get_queryset()    # type: TextUnit.objects
     _, trained = builder.build_doc2vec_text_unit_model(text_unit_qs=text_units)
     return trained
Beispiel #11
0
 def build_doc2vec_model(self) -> MLModel:
     """
     Train a doc2vec model on the documents of ``self.get_queryset()``.

     The transformer is created with ``self.file_storage``.

     :return: the second item of the pair returned by
         ``build_doc2vec_document_model`` (the transformer object)
     """
     hyper_params = dict(vector_size=100, window=10, min_count=10, dm=1)
     builder = Doc2VecTransformer(file_storage=self.file_storage, **hyper_params)
     documents = self.get_queryset()    # type: Document.objects
     _, trained = builder.build_doc2vec_document_model(document_qs=documents)
     return trained