Example no. 1
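The six `__init__` methods below appear to be constructors of pyannote.audio 1.x `Pipeline` subclasses, collected here without their class statements or imports. A plausible import preamble for reading them in context (module paths are assumptions based on the pyannote.audio 1.x layout, not confirmed by this excerpt):

import torch
from pathlib import Path
from typing import Optional, Text, Union

from pyannote.core.utils.helper import get_class_by_name
from pyannote.pipeline import Pipeline
from pyannote.pipeline.parameter import Integer, LogUniform, Uniform
from pyannote.audio.features.wrapper import Wrappable, Wrapper
from pyannote.audio.pipeline.speech_turn_segmentation import (
    OracleSpeechTurnSegmentation,
    SpeechTurnSegmentation,
)
from pyannote.audio.pipeline.speech_turn_clustering import SpeechTurnClustering
from pyannote.audio.pipeline.speech_turn_assignment import SpeechTurnClosestAssignment

Throughout, `Wrappable` values such as "@scd_scores" follow pyannote's `Wrapper` convention: an "@key" string resolves to precomputed scores stored under that key in the file dictionary. Attributes declared as `Uniform`, `LogUniform`, or `Integer` are tunable hyper-parameters rather than fixed values.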
    def __init__(self, scores: Optional[Wrappable] = None,
                       purity: Optional[float] = 0.95,
                       fscore: bool = False,
                       diarization: bool = False):
        super().__init__()

        if scores is None:
            scores = "@scd_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.purity = purity
        self.fscore = fscore
        self.diarization = diarization

        # hyper-parameters
        self.alpha = Uniform(0., 1.)
        self.min_duration = Uniform(0., 10.)
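This first constructor matches the signature of pyannote.audio's SpeakerChangeDetection pipeline: `alpha` (a peak-detection threshold on the change scores) and `min_duration` (minimum gap between detected changes, in seconds) are the two tunable hyper-parameters, and `purity` is the target used during optimization. A minimal usage sketch, assuming the surrounding class is that pipeline and the standard `pyannote.pipeline` instantiate API (values are illustrative):

# hypothetical working point: freeze both hyper-parameters to concrete values
pipeline = SpeakerChangeDetection(scores="@scd_scores", purity=0.95)
pipeline.instantiate({"alpha": 0.12, "min_duration": 0.8})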
    def __init__(
        self,
        sad_scores: Optional[Union[Text, Path]] = None,
        scd_scores: Optional[Union[Text, Path]] = None,
        embedding: Optional[Union[Text, Path]] = None,
        metric: Optional[str] = "cosine",
        method: Optional[str] = "pool",
        evaluation_only: Optional[bool] = False,
        purity: Optional[float] = None,
    ):

        super().__init__()
        self.sad_scores = sad_scores
        self.scd_scores = scd_scores
        if self.scd_scores == "oracle":
            if self.sad_scores == "oracle":
                self.speech_turn_segmentation = OracleSpeechTurnSegmentation()
            else:
                msg = (
                    f"Both sad_scores and scd_scores should be set to 'oracle' "
                    f"for oracle speech turn segmentation, "
                    f"got {self.sad_scores} and {self.scd_scores}, respectively."
                )
                raise ValueError(msg)
        else:
            self.speech_turn_segmentation = SpeechTurnSegmentation(
                sad_scores=self.sad_scores, scd_scores=self.scd_scores)
        self.evaluation_only = evaluation_only
        self.purity = purity

        self.min_duration = Uniform(0, 10)

        self.embedding = embedding
        self.metric = metric
        self.method = method
        self.speech_turn_clustering = SpeechTurnClustering(
            embedding=self.embedding, metric=self.metric, method=self.method)

        self.speech_turn_assignment = SpeechTurnClosestAssignment(
            embedding=self.embedding, metric=self.metric)
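This second constructor looks like pyannote.audio's full SpeakerDiarization pipeline: speech turns are first segmented (using an oracle only when both score sources are set to "oracle"), then clustered on neural embeddings, with remaining turns assigned to the closest existing cluster. A hedged construction sketch (keys are illustrative; sub-pipeline hyper-parameters would be passed to `instantiate` as nested dictionaries):

# hypothetical usage: precomputed scores and embeddings looked up by key
pipeline = SpeakerDiarization(
    sad_scores="@sad_scores",
    scd_scores="@scd_scores",
    embedding="@emb",
    metric="cosine",
    method="pool",
)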
    def __init__(self, scores: Optional[Wrappable] = None, fscore: bool = False):
        super().__init__()

        if scores is None:
            scores = "@sad_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.fscore = fscore

        # hyper-parameters
        self.onset = Uniform(0., 1.)
        self.offset = Uniform(0., 1.)
        self.min_duration_on = Uniform(0., 2.)
        self.min_duration_off = Uniform(0., 2.)
        self.pad_onset = Uniform(-1., 1.)
        self.pad_offset = Uniform(-1., 1.)
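This has the shape of pyannote.audio's SpeechActivityDetection pipeline: detection scores are binarized with hysteresis (`onset` to enter a speech region, `offset` to leave it), regions and gaps shorter than `min_duration_on` / `min_duration_off` are removed or filled, and boundaries can be padded with `pad_onset` / `pad_offset`. A minimal sketch of fixing a hand-picked working point (assumed API, illustrative values):

# hypothetical working point instead of automatic optimization
pipeline = SpeechActivityDetection(scores="@sad_scores")
pipeline.instantiate({
    "onset": 0.6, "offset": 0.4,                      # hysteresis thresholds
    "min_duration_on": 0.1, "min_duration_off": 0.1,  # prune/fill short segments
    "pad_onset": 0.0, "pad_offset": 0.0,              # boundary padding (seconds)
})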
    def __init__(
        self,
        sad: Union[Text, Path] = {"sad": {
            "duration": 2.0,
            "step": 0.1
        }},
        emb: Union[Text, Path] = "emb",
        batch_size: int = None,
        only_sad: bool = False,
    ):

        super().__init__()

        self.sad = Wrapper(sad)
        if batch_size is not None:
            self.sad.batch_size = batch_size
        self.sad_speech_index_ = self.sad.classes.index("speech")

        self.sad_threshold_on = Uniform(0.0, 1.0)
        self.sad_threshold_off = Uniform(0.0, 1.0)
        self.sad_min_duration_on = Uniform(0.0, 0.5)
        self.sad_min_duration_off = Uniform(0.0, 0.5)

        self.only_sad = only_sad
        if self.only_sad:
            return

        self.emb = Wrapper(emb)
        if batch_size is not None:
            self.emb.batch_size = batch_size

        max_duration = self.emb.duration
        min_duration = getattr(self.emb, "min_duration", 0.25 * max_duration)
        self.emb_duration = Uniform(min_duration, max_duration)
        self.emb_step_ratio = Uniform(0.1, 1.0)
        self.emb_threshold = Uniform(0.0, 2.0)
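This fourth constructor resembles pyannote.audio's InteractiveDiarization pipeline: a wrapped SAD model with its own hysteresis hyper-parameters, plus an optional embedding branch whose sliding-window duration, step ratio, and clustering threshold are tunable (`emb_step_ratio` is a fraction of `emb_duration`, so the effective step is `emb_step_ratio * emb_duration`). A hedged sketch of the SAD-only mode, which skips the embedding branch entirely:

# hypothetical usage with only_sad=True: no embedding model is loaded
pipeline = InteractiveDiarization(only_sad=True, batch_size=32)
pipeline.instantiate({
    "sad_threshold_on": 0.6, "sad_threshold_off": 0.4,
    "sad_min_duration_on": 0.1, "sad_min_duration_off": 0.1,
})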
    def __init__(self,
                 scores: Optional[Wrappable] = None,
                 precision: float = 0.9,
                 fscore: bool = False):
        super().__init__()

        if scores is None:
            scores = "@ovl_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.precision = precision
        self.fscore = fscore

        # hyper-parameters
        self.onset = Uniform(0.0, 1.0)
        self.offset = Uniform(0.0, 1.0)
        self.min_duration_on = Uniform(0.0, 2.0)
        self.min_duration_off = Uniform(0.0, 2.0)
        self.pad_onset = Uniform(-1.0, 1.0)
        self.pad_offset = Uniform(-1.0, 1.0)
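This constructor mirrors the SpeechActivityDetection one but appears to belong to pyannote.audio's OverlappedSpeechDetection pipeline, with `precision` as the optimization target instead of `purity`. Since such pipelines are usually tuned rather than hand-instantiated, here is a hedged tuning sketch using `pyannote.pipeline`'s Optimizer (`dev_files`, an iterable of annotated development files, is hypothetical, and the exact `tune` signature is an assumption):

# hypothetical tuning loop; Optimizer searches the declared hyper-parameter space
from pyannote.pipeline import Optimizer
pipeline = OverlappedSpeechDetection(scores="@ovl_scores", precision=0.9)
optimizer = Optimizer(pipeline)
optimizer.tune(dev_files, n_iterations=100)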
    def __init__(self, feature_extraction: Optional[dict] = None,
                       architecture: Optional[dict] = None,
                       overlap: Optional[bool] = False,
                       keep_sad: Optional[bool] = False,
                       mask: Optional[dict] = None,
                       augmentation: Optional[bool] = False,
                       duration: Optional[float] = 2.0,
                       batch_size: Optional[int] = 32,
                       gpu: Optional[bool] = False):

        super().__init__()
        # feature extraction
        if feature_extraction is None:
            from pyannote.audio.features import LibrosaMFCC
            self.feature_extraction_ = LibrosaMFCC(
                e=False, De=True, DDe=True,
                coefs=19, D=True, DD=True,
                duration=0.025, step=0.010, sample_rate=16000,
            )
        else:
            FeatureExtraction = get_class_by_name(
                feature_extraction['name'],
                default_module_name='pyannote.audio.features')
            self.feature_extraction_ = FeatureExtraction(
                **feature_extraction.get('params', {}),
                augmentation=None)

        # network architecture
        if architecture is None:
            from pyannote.audio.models import PyanNet
            self.Architecture_ = PyanNet
            self.architecture_params_ = {'sincnet': {'skip': True}}

        else:
            self.Architecture_ = get_class_by_name(
                architecture['name'],
                default_module_name='pyannote.audio.models')
            self.architecture_params_ = architecture.get('params', {})

        self.overlap = overlap
        self.keep_sad = keep_sad

        self.mask = mask
        if mask is None:
            self.mask_dimension_ = None
            self.mask_logscale_ = False
        else:
            self.mask_dimension_ = mask['dimension']
            self.mask_logscale_ = mask['log_scale']

        self.augmentation = augmentation

        self.duration = duration
        self.batch_size = batch_size
        self.gpu = gpu
        self.device_ = torch.device('cuda') if self.gpu else torch.device('cpu')

        # hyper-parameters
        self.learning_rate = LogUniform(1e-3, 1)
        self.epochs = Integer(10, 50)
        self.ensemble = Integer(1, 5)
        if self.overlap:
            self.overlap_threshold = Uniform(0, 1)
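This last constructor matches pyannote.audio's Resegmentation pipeline, which retrains a small model on the test file itself (by default PyanNet with SincNet skipped, fed with librosa MFCCs); `learning_rate`, `epochs`, and `ensemble` are tuned, plus `overlap_threshold` when overlap assignment is enabled. A hedged usage sketch with fixed training hyper-parameters (values are illustrative):

# hypothetical working point for the resegmentation training step
pipeline = Resegmentation(duration=2.0, batch_size=32, gpu=False)
pipeline.instantiate({"learning_rate": 0.01, "epochs": 20, "ensemble": 1})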