Example #1
 def _text_process(
         self, data: Dict[str, Union[str,
                                     np.ndarray]]) -> Dict[str, np.ndarray]:
     for i in range(self.num_tokenizer):
         text_name = self.text_name[i]
         if text_name in data and self.tokenizer[i] is not None:
             text = data[text_name]
             text = self.text_cleaner(text)
             tokens = self.tokenizer[i].text2tokens(text)
             text_ints = self.token_id_converter[i].tokens2ids(tokens)
             data[text_name] = np.array(text_ints, dtype=np.int64)
     assert check_return_type(data)
     return data
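A minimal, self-contained stand-in for the tokenize -> ids -> int64 pipeline above; the whitespace split and the toy vocabulary below are placeholders for the real ESPnet tokenizer and token-id converter.

import numpy as np

vocab = {"hello": 0, "world": 1}            # toy vocabulary, not ESPnet's
tokens = "hello world".split()              # stands in for tokenizer.text2tokens
text_ints = [vocab[t] for t in tokens]      # stands in for tokens2ids
print(np.array(text_ints, dtype=np.int64))  # -> [0 1]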
Example #2
 def optional_data_names(cls,
                         train: bool = True,
                         inference: bool = False) -> Tuple[str, ...]:
     retval = ["dereverb_ref"]
     retval += [
         "speech_ref{}".format(n) for n in range(2, MAX_REFERENCE_NUM + 1)
     ]
     retval += [
         "noise_ref{}".format(n) for n in range(1, MAX_REFERENCE_NUM + 1)
     ]
     retval = tuple(retval)
     assert check_return_type(retval)
     return retval
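For concreteness, the tuple built above can be reproduced stand-alone; MAX_REFERENCE_NUM = 4 below is an illustrative value, not the constant defined in the ESPnet task.

MAX_REFERENCE_NUM = 4  # illustrative only
names = ("dereverb_ref",) \
    + tuple(f"speech_ref{n}" for n in range(2, MAX_REFERENCE_NUM + 1)) \
    + tuple(f"noise_ref{n}" for n in range(1, MAX_REFERENCE_NUM + 1))
print(names)
# ('dereverb_ref', 'speech_ref2', 'speech_ref3', 'speech_ref4',
#  'noise_ref1', 'noise_ref2', 'noise_ref3', 'noise_ref4')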
Example #3
    def build_model(cls, args: argparse.Namespace) -> ESPnetTTSModel:
        assert check_argument_types()
        if isinstance(args.token_list, str):
            with open(args.token_list, encoding="utf-8") as f:
                token_list = [line.rstrip() for line in f]

            # "args" is saved as it is in a yaml file by BaseTask.main().
            # Overwriting token_list to keep it as "portable".
            args.token_list = token_list.copy()
        elif isinstance(args.token_list, (tuple, list)):
            token_list = args.token_list.copy()
        else:
            raise RuntimeError("token_list must be str or list")

        vocab_size = len(token_list)
        logging.info(f"Vocabulary size: {vocab_size}")

        # 1. feats_extract
        if args.odim is None:
            # Extract features in the model
            feats_extract_class = feats_extractor_choices.get_class(
                args.feats_extract)
            feats_extract = feats_extract_class(**args.feats_extract_conf)
            odim = feats_extract.output_size()
        else:
            # Give features from data-loader
            args.feats_extract = None
            args.feats_extract_conf = None
            feats_extract = None
            odim = args.odim

        # 2. Normalization layer
        if args.normalize is not None:
            normalize_class = normalize_choices.get_class(args.normalize)
            normalize = normalize_class(**args.normalize_conf)
        else:
            normalize = None

        # 3. TTS
        tts_class = tts_choices.get_class(args.tts)
        tts = tts_class(idim=vocab_size, odim=odim, **args.tts_conf)

        # 4. Build model
        model = ESPnetTTSModel(
            feats_extract=feats_extract,
            normalize=normalize,
            tts=tts,
            **args.model_conf,
        )
        assert check_return_type(model)
        return model
Example #4
    def get_class(self, name: Optional[str]) -> Optional[type]:
        assert check_argument_types()
        if name is None or (self.optional
                            and name.lower() in ("none", "null", "nil")):
            retval = None
        elif name.lower() in self.classes:
            class_obj = self.classes[name]
            assert check_return_type(class_obj)
            retval = class_obj
        else:
            raise ValueError(f"--{self.name} must be one of {self.choices()}: "
                             f"--{self.name} {name.lower()}")

        return retval
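A toy sketch of the same name-to-class lookup (the optional flag is dropped for brevity); the registry contents and class names are made up, not the real ESPnet choices.

from typing import Optional


class GlobalMVN: ...
class UtteranceMVN: ...


classes = {"global_mvn": GlobalMVN, "utterance_mvn": UtteranceMVN}


def get_class_demo(name: Optional[str]) -> Optional[type]:
    if name is None or name.lower() in ("none", "null", "nil"):
        return None
    return classes[name.lower()]


assert get_class_demo("GLOBAL_MVN") is GlobalMVN
assert get_class_demo("none") is None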
Example #5
def _get_genotypes_col(entry: StructExpression,
                       sample_ids: Optional[List[str]]) -> Column:
    assert check_argument_types()

    entry_aliases = {
        'DP': 'depth',
        'FT': 'filters',
        'GL': 'genotypeLikelihoods',
        'PL': 'phredLikelihoods',
        'GP': 'posteriorProbabilities',
        'GQ': 'conditionalQuality',
        'HQ': 'haplotypeQualities',
        'EC': 'expectedAlleleCounts',
        'MQ': 'mappingQuality',
        'AD': 'alleleDepths'
    }

    base_struct_args = []
    for entry_field in entry:
        if entry_field == 'GT' and entry.GT.dtype == tcall:
            # Flatten GT into calls and phased
            base_struct_args.append(
                "'calls', e.GT.alleles, 'phased', e.GT.phased")
        elif entry[entry_field].dtype == tcall:
            # Turn other call fields (e.g. PGT) into a string
            base_struct_args.append(
                f"'{entry_field}', array_join(e.{entry_field}.alleles, if(e.{entry_field}.phased, '|', '/'))"
            )
        elif entry_field in entry_aliases:
            # Rename aliased genotype fields
            base_struct_args.append(
                f"'{entry_aliases[entry_field]}', e.{entry_field}")
        else:
            # Rename genotype fields
            base_struct_args.append(f"'{entry_field}', e.{entry_field}")

    if sample_ids is not None:
        sample_id_expr = f"array({' ,'.join(sample_ids)})"
        struct_expr = ' ,'.join(["'sampleId', s"] + base_struct_args)
        genotypes_col = fx.expr(
            f"zip_with({sample_id_expr}, entries, (s, e) -> named_struct({struct_expr}))"
        )
    else:
        struct_expr = ' ,'.join(base_struct_args)
        genotypes_col = fx.expr(
            f"transform(entries, e -> named_struct({struct_expr}))")
    genotypes_col = genotypes_col.alias("genotypes")

    assert check_return_type(genotypes_col)
    return genotypes_col
Example #6
    def taskwrapper(*args, **kwargs):
        _repr_function(target=target, args=args, kwargs=kwargs)

        # Warning for explicit parameters
        if args:
            short = [str(a)[:20] for a in args]
            _LOGGER.warning("Use explicit parameters, instead of %s", short)

        # Warning on parameter types
        call_memo = _CallMemo(target, args=args, kwargs=kwargs)
        try:
            check_argument_types(call_memo)
        except TypeError as err:
            _LOGGER.warning(err)

        if validator:
            validator(kwargs)

        try:
            value = target(*args, **kwargs)
        except Exception as err:
            _LOGGER.error(
                "Error while running task `%s` - %s: %s", name, type(err).__name__, err
            )
            raise

        if isgeneratorfunction(target) or (
            isinstance(value, list) and not isinstance(value, ATable)
        ):
            value = ATable(value)

        try:
            check_return_type(value, call_memo)
        except TypeError as err:
            _LOGGER.error(err)

        return value
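The _CallMemo / check_argument_types(call_memo) calls above are private typeguard 2.x internals; below is a hedged sketch of the equivalent public API (the add function is purely illustrative).

from typeguard import typechecked


@typechecked
def add(a: int, b: int) -> int:
    return a + b


add(1, 2)      # passes the runtime check
# add(1, "2")  # would fail the check (TypeError in typeguard 2.x)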
Example #7
def logistic_regression_gwas(genotypes: Union[Column, str],
                             phenotypes: Union[Column, str],
                             covariates: Union[Column, str],
                             test: str,
                             offset: Union[Column, str] = None) -> Column:
    """
    Performs a logistic regression association test optimized for performance in a GWAS setting. See :ref:`logistic-regression` for more details.

    Added in version 0.3.0.

    Examples:
        >>> from pyspark.ml.linalg import DenseMatrix
        >>> phenotypes = [1, 0, 0, 1, 1]
        >>> genotypes = [0, 0, 1, 2, 2]
        >>> covariates = DenseMatrix(numRows=5, numCols=1, values=[1, 1, 1, 1, 1])
        >>> offset = [1, 0, 1, 0, 1]
        >>> df = spark.createDataFrame([Row(genotypes=genotypes, phenotypes=phenotypes, covariates=covariates, offset=offset)])
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'Firth'))).collect()
        [Row(beta=0.7418937644793101, oddsRatio=2.09990848346903, waldConfidenceInterval=[0.2509874689201784, 17.569066925598555], pValue=0.3952193664793294)]
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'LRT'))).collect()
        [Row(beta=1.1658962684583645, oddsRatio=3.208797538802116, waldConfidenceInterval=[0.29709600522888285, 34.65674887513274], pValue=0.2943946848756769)]
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'Firth', 'offset'))).collect()
        [Row(beta=0.8024832156793392, oddsRatio=2.231074294047771, waldConfidenceInterval=[0.2540891981649045, 19.590334974925725], pValue=0.3754070658316332)]
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'LRT', 'offset'))).collect()
        [Row(beta=1.1996041727573317, oddsRatio=3.3188029900720117, waldConfidenceInterval=[0.3071189078535928, 35.863807161497334], pValue=0.2857137988674153)]

    Args:
        genotypes : A numeric array of genotypes
        phenotypes : A double array of phenotype values
        covariates : A ``spark.ml`` ``Matrix`` of covariates
        test : Which logistic regression test to use. Can be ``LRT`` or ``Firth``
        offset : An optional double array of offset values. The offset vector is added with coefficient 1 to the linear predictor term X*b.

    Returns:
        A struct containing ``beta``, ``oddsRatio``, ``waldConfidenceInterval``, and ``pValue`` fields. See :ref:`logistic-regression`.
    """
    assert check_argument_types()
    if offset is None:
        output = Column(
            sc()._jvm.io.projectglow.functions.logistic_regression_gwas(
                _to_java_column(genotypes), _to_java_column(phenotypes),
                _to_java_column(covariates), test))
    else:
        output = Column(
            sc()._jvm.io.projectglow.functions.logistic_regression_gwas(
                _to_java_column(genotypes), _to_java_column(phenotypes),
                _to_java_column(covariates), test, _to_java_column(offset)))
    assert check_return_type(output)
    return output
Example #8
def load_num_sequence_text(
        path: Union[Path, str],
        loader_type: str = "csv_int") -> Dict[str, np.ndarray]:
    """Read a text file indicating sequences of number

    Examples:
        key1 1 2 3
        key2 34 5 6

        >>> d = load_num_sequence_text('text')
        >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3]))
    """
    assert check_argument_types()
    if loader_type == "text_int":
        delimiter = " "
        dtype = np.int64  # np.long was removed from recent NumPy
    elif loader_type == "text_float":
        delimiter = " "
        dtype = np.float32
    elif loader_type == "csv_int":
        delimiter = ","
        dtype = np.int64  # np.long was removed from recent NumPy
    elif loader_type == "csv_float":
        delimiter = ","
        dtype = np.float32
    else:
        raise ValueError(f"Not supported loader_type={loader_type}")

    # path looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> return {'utta': np.ndarray([1, 0]),
    #            'uttb': np.ndarray([3, 4, 5])}
    d = read_2column_text(path)

    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = np.loadtxt(StringIO(v),
                                   ndmin=1,
                                   dtype=dtype,
                                   delimiter=delimiter)
        except ValueError:
            logging.error(
                f'Error happened with path="{path}", id="{k}", value="{v}"')
            raise
    assert check_return_type(retval)
    return retval
Example #9
def _convert_numpy_to_java_array(np_arr: np.ndarray) -> JavaArray:
    """
    Converts a flat numpy array of doubles to a Java array of doubles.
    """
    assert check_argument_types()
    assert len(np_arr.shape) == 1
    assert np_arr.dtype.type == np.double

    sc = SparkContext._active_spark_context
    size = np_arr.shape[0]
    # Convert to big endian and serialize
    byte_arr = np.ascontiguousarray(np_arr, '>d').tobytes()
    java_arr = sc._jvm.io.projectglow.common.PythonUtils.doubleArrayFromBytes(size, byte_arr)
    assert check_return_type(java_arr)
    return java_arr
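A Spark-free sketch of the byte-level step the conversion above relies on: the doubles are re-serialized in big-endian order ('>d') so the JVM side can decode them; the struct round-trip below just verifies that.

import struct

import numpy as np

arr = np.array([1.0, 2.5, -3.0])
raw = np.ascontiguousarray(arr, '>d').tobytes()   # big-endian float64 bytes
assert struct.unpack('>3d', raw) == (1.0, 2.5, -3.0)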
Example #10
def _convert_numpy_to_java_array(np_arr: np.ndarray) -> JavaArray:
    """
    Converts a flat numpy array of doubles to a Java array of doubles.
    """
    assert check_argument_types()
    assert len(np_arr.shape) == 1
    assert np_arr.dtype.type == np.double

    sc = SparkContext._active_spark_context
    java_arr = sc._gateway.new_array(sc._jvm.double, np_arr.shape[0])
    for idx, ele in enumerate(np_arr):
        java_arr[idx] = ele.item()

    assert check_return_type(java_arr)
    return java_arr
Example #11
File: lm.py Project: zy1022/espnet
 def build_preprocess_fn(
     cls, args: argparse.Namespace, train: bool
 ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
     assert check_argument_types()
     if args.use_preprocessor:
         retval = CommonPreprocessor(
             train=train,
             token_type=args.token_type,
             token_list=args.token_list,
             bpemodel=args.bpemodel,
         )
     else:
         retval = None
     assert check_return_type(retval)
     return retval
Example #12
    def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel:
        assert check_argument_types()

        enh_model = enh_choices.get_class(args.enh)(**args.enh_conf)

        # 1. Build model
        model = ESPnetEnhancementModel(enh_model=enh_model)

        # FIXME(kamo): Should be done in model?
        # 2. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
Example #13
def normalize_variant(contigName: Union[Column, str],
                      start: Union[Column, str],
                      end: Union[Column, str],
                      refAllele: Union[Column, str],
                      altAlleles: Union[Column, str],
                      refGenomePathString: str) -> Column:
    """
    Normalizes the variant with a behavior similar to vt normalize or bcftools norm.
    Creates a StructType column including the normalized ``start``, ``end``, ``referenceAllele`` and
    ``alternateAlleles`` fields (whether they are changed or unchanged as the result of
    normalization) as well as a StructType field called ``normalizationStatus`` that
    contains the following fields:

       ``changed``: A boolean field indicating whether the variant data was changed as a result of normalization

       ``errorMessage``: An error message in case the attempt at normalizing the row hit an error. In this case, the ``changed`` field will be set to ``false``. If no errors occur, this field will be ``null``.

    In case of an error, the ``start``, ``end``, ``referenceAllele`` and ``alternateAlleles`` fields in the generated struct will be ``null``.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.read.format('vcf').load('test-data/variantsplitternormalizer-test/test_left_align_hg38_altered.vcf')
        >>> ref_genome = 'test-data/variantsplitternormalizer-test/Homo_sapiens_assembly38.20.21_altered.fasta'
        >>> df.select('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles').head()
        Row(contigName='chr20', start=400, end=401, referenceAllele='G', alternateAlleles=['GATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGGTTTGAG'])
        >>> normalized_df = df.select('contigName', glow.expand_struct(glow.normalize_variant('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles', ref_genome)))
        >>> normalized_df.head()
        Row(contigName='chr20', start=268, end=269, referenceAllele='A', alternateAlleles=['ATTTGAGATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGG'], normalizationStatus=Row(changed=True, errorMessage=None))

    Args:
        contigName : The current contig name
        start : The current start
        end : The current end
        refAllele : The current reference allele
        altAlleles : The current array of alternate alleles
        refGenomePathString : A path to the reference genome ``.fasta`` file. The ``.fasta`` file must be accompanied with a ``.fai`` index file in the same folder.

    Returns:
        A struct as explained above
    """
    assert check_argument_types()
    output = Column(sc()._jvm.io.projectglow.functions.normalize_variant(
        _to_java_column(contigName), _to_java_column(start),
        _to_java_column(end), _to_java_column(refAllele),
        _to_java_column(altAlleles), refGenomePathString))
    assert check_return_type(output)
    return output
Example #14
def block_variants_and_samples(
        variant_df: DataFrame, sample_ids: List[str], variants_per_block: int,
        sample_block_count: int) -> (DataFrame, Dict[str, List[str]]):
    """
    Creates a blocked GT matrix and index mapping from sample blocks to a list of corresponding sample IDs. Uses the
    same sample-blocking logic as the blocked GT matrix transformer.

    Requires that:
    - Each variant row has the same number of values
    - The number of values per row matches the number of sample IDs

    Args:
        variant_df : The variant DataFrame
        sample_ids : The list of sample ID strings
        variants_per_block : The number of variants per block
        sample_block_count : The number of sample blocks

    Returns:
        tuple of (blocked GT matrix, index mapping)
    """
    assert check_argument_types()
    distinct_num_values = variant_df.selectExpr(
        "size(values) as numValues").distinct()
    distinct_num_values_count = distinct_num_values.count()
    if distinct_num_values_count == 0:
        raise Exception("DataFrame has no values.")
    if distinct_num_values_count > 1:
        raise Exception("Each row must have the same number of values.")
    num_values = distinct_num_values.head().numValues
    if num_values != len(sample_ids):
        raise Exception(
            "Number of values does not match between DataFrame and sample ID list."
        )
    __validate_sample_ids(sample_ids)

    blocked_gt = glow.transform("block_variants_and_samples",
                                variant_df,
                                variants_per_block=variants_per_block,
                                sample_block_count=sample_block_count)
    index_map = __get_index_map(sample_ids, sample_block_count,
                                variant_df.sql_ctx)

    output = blocked_gt, index_map
    assert check_return_type(output)
    return output
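A hedged usage sketch for the function above; the VCF path, sample IDs, and block sizes are illustrative, and it assumes Glow is registered on the session so that genotype_states can populate the required `values` column.

import glow
from pyspark.sql import SparkSession

spark = glow.register(SparkSession.builder.getOrCreate())
variant_df = (spark.read.format("vcf").load("genotypes.vcf")
              .withColumn("values", glow.genotype_states("genotypes")))
sample_ids = ["HG001", "HG002", "HG003"]  # must match the length of `values`
block_df, index_map = block_variants_and_samples(
    variant_df, sample_ids, variants_per_block=1000, sample_block_count=1)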
Example #15
File: enh.py Project: akreal/espnet
    def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel:
        assert check_argument_types()

        encoder = encoder_choices.get_class(args.encoder)(**args.encoder_conf)
        separator = separator_choices.get_class(args.separator)(
            encoder.output_dim, **args.separator_conf)
        decoder = decoder_choices.get_class(args.decoder)(**args.decoder_conf)
        if args.separator.endswith("nomask"):
            mask_module = mask_module_choices.get_class(args.mask_module)(
                input_dim=encoder.output_dim,
                **args.mask_module_conf,
            )
        else:
            mask_module = None

        loss_wrappers = []

        if getattr(args, "criterions", None) is not None:
            # This check keeps compatibility when loading models
            # packed by an older version
            for ctr in args.criterions:
                criterion_conf = ctr.get("conf", {})
                criterion = criterion_choices.get_class(
                    ctr["name"])(**criterion_conf)
                loss_wrapper = loss_wrapper_choices.get_class(ctr["wrapper"])(
                    criterion=criterion, **ctr["wrapper_conf"])
                loss_wrappers.append(loss_wrapper)

        # 1. Build model
        model = ESPnetEnhancementModel(
            encoder=encoder,
            separator=separator,
            decoder=decoder,
            loss_wrappers=loss_wrappers,
            mask_module=mask_module,
            **args.model_conf,
        )

        # FIXME(kamo): Should be done in model?
        # 2. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
Example #16
    def optional_data_names(cls,
                            train: bool = True,
                            inference: bool = False) -> Tuple[str, ...]:
        """Optional data depending on task mode.

        Args:
            cls: ASRTransducerTask object.
            train: Training mode.
            inference: Inference mode.

        Return:
            retval: Optional task data.

        """
        retval = ()
        assert check_return_type(retval)

        return retval
Example #17
def to_reported_value(v: Num, weight: Num = None) -> "ReportedValue":
    assert check_argument_types()
    if isinstance(v, (torch.Tensor, np.ndarray)):
        if np.prod(v.shape) != 1:
            raise ValueError(f"v must be 0 or 1 dimension: {len(v.shape)}")
        v = v.item()

    if isinstance(weight, (torch.Tensor, np.ndarray)):
        if np.prod(weight.shape) != 1:
            raise ValueError(f"weight must be 0 or 1 dimension: {len(weight.shape)}")
        weight = weight.item()

    if weight is not None:
        retval = WeightedAverage(v, weight)
    else:
        retval = Average(v)
    assert check_return_type(retval)
    return retval
Example #18
 def build_preprocess_fn(
     cls, args: argparse.Namespace, train: bool
 ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
     assert check_argument_types()
     if args.use_preprocessor:
         if "st" in args.subtask_series:
             retval = MutliTokenizerCommonPreprocessor(
                 train=train,
                 token_type=[args.token_type, args.src_token_type],
                 token_list=[args.token_list, args.src_token_list],
                 bpemodel=[args.bpemodel, args.src_bpemodel],
                 non_linguistic_symbols=args.non_linguistic_symbols,
                 text_cleaner=args.cleaner,
                 g2p_type=args.g2p,
                 # NOTE(kamo): Check attribute existence for backward compatibility
                 rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
                 rir_apply_prob=args.rir_apply_prob if hasattr(
                     args, "rir_apply_prob") else 1.0,
                 noise_scp=args.noise_scp
                 if hasattr(args, "noise_scp") else None,
                 noise_apply_prob=args.noise_apply_prob if hasattr(
                     args, "noise_apply_prob") else 1.0,
                 noise_db_range=args.noise_db_range if hasattr(
                     args, "noise_db_range") else "13_15",
                 speech_volume_normalize=args.speech_volume_normalize
                 if hasattr(args, "speech_volume_normalize") else None,
                 speech_name="speech",
                 text_name=["text", "src_text"],
             )
         else:
             retval = CommonPreprocessor_multi(
                 train=train,
                 token_type=args.token_type,
                 token_list=args.token_list,
                 bpemodel=args.bpemodel,
                 non_linguistic_symbols=args.non_linguistic_symbols,
                 text_name=["text"],
                 text_cleaner=args.cleaner,
                 g2p_type=args.g2p,
             )
     else:
         retval = None
     assert check_return_type(retval)
     return retval
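Hedged aside: each hasattr-guarded keyword argument above can also be written with getattr and a default, which reads a little shorter; the namespace below is a stand-in for the real task args.

import argparse

args = argparse.Namespace(rir_scp=None)                    # stand-in for task args
rir_scp = getattr(args, "rir_scp", None)
rir_apply_prob = getattr(args, "rir_apply_prob", 1.0)      # falls back to 1.0
noise_db_range = getattr(args, "noise_db_range", "13_15")  # falls back to "13_15"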
Example #19
def lift_over_coordinates(contigName: Union[Column, str],
                          start: Union[Column, str],
                          end: Union[Column, str],
                          chainFile: str,
                          minMatchRatio: float = None) -> Column:
    """
    Performs liftover for the coordinates of a variant. To perform liftover of alleles and add additional metadata, see :ref:`liftover`.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.read.format('vcf').load('test-data/liftover/unlifted.test.vcf').where('start = 18210071')
        >>> chain_file = 'test-data/liftover/hg38ToHg19.over.chain.gz'
        >>> reference_file = 'test-data/liftover/hg19.chr20.fa.gz'
        >>> df.select('contigName', 'start', 'end').head()
        Row(contigName='chr20', start=18210071, end=18210072)
        >>> lifted_df = df.select(glow.expand_struct(glow.lift_over_coordinates('contigName', 'start', 'end', chain_file)))
        >>> lifted_df.head()
        Row(contigName='chr20', start=18190715, end=18190716)

    Args:
        contigName : The current contig name
        start : The current start
        end : The current end
        chainFile : Location of the chain file on each node in the cluster
        minMatchRatio : Minimum fraction of bases that must remap to do liftover successfully. If not provided, defaults to ``0.95``.

    Returns:
        A struct containing ``contigName``, ``start``, and ``end`` fields after liftover
    """
    assert check_argument_types()
    if minMatchRatio is None:
        output = Column(
            sc()._jvm.io.projectglow.functions.lift_over_coordinates(
                _to_java_column(contigName), _to_java_column(start),
                _to_java_column(end), chainFile))
    else:
        output = Column(
            sc()._jvm.io.projectglow.functions.lift_over_coordinates(
                _to_java_column(contigName), _to_java_column(start),
                _to_java_column(end), chainFile, minMatchRatio))
    assert check_return_type(output)
    return output
Example #20
 def build_preprocess_fn(
     cls, args: argparse.Namespace, train: bool
 ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
     assert check_argument_types()
     if args.use_preprocessor:
         retval = MutliTokenizerCommonPreprocessor(
             train=train,
             token_type=[args.token_type, args.src_token_type],
             token_list=[args.token_list, args.src_token_list],
             bpemodel=[args.bpemodel, args.src_bpemodel],
             non_linguistic_symbols=args.non_linguistic_symbols,
             text_cleaner=args.cleaner,
             g2p_type=args.g2p,
             text_name=["text", "src_text"],
         )
     else:
         retval = None
     assert check_return_type(retval)
     return retval
Example #21
def _get_other_cols(row: StructExpression) -> List[Column]:
    assert check_argument_types()

    other_cols = []
    if 'cm_position' in row and row.cm_position.dtype == tfloat64:
        other_cols.append(fx.col("cm_position").alias("position"))
    if 'qual' in row and row.qual.dtype == tfloat64:
        # -10 qual means missing
        other_cols.append(fx.expr("if(qual = -10, null, qual)").alias("qual"))
    # [] filters means PASS, null filters means missing
    if 'filters' in row and row.filters.dtype == tset(tstr):
        other_cols.append(fx.expr("if(size(filters) = 0, array('PASS'), filters)").alias("filters"))
    # Rename info.* columns to INFO_*
    if 'info' in row and isinstance(row.info.dtype, tstruct):
        for f in row.info:
            other_cols.append(fx.col(f"`info.{f}`").alias(f"INFO_{f}"))

    assert check_return_type(other_cols)
    return other_cols
Example #22
def aggregate(values: Sequence["ReportedValue"]) -> Num:
    assert check_argument_types()

    for v in values:
        if not isinstance(v, type(values[0])):
            raise ValueError(
                f"Can't use different Reported type together: "
                f"{type(v)} != {type(values[0])}"
            )

    if len(values) == 0:
        warnings.warn("No stats found")
        retval = np.nan

    elif isinstance(values[0], Average):
        retval = np.nanmean([v.value for v in values])

    elif isinstance(values[0], WeightedAverage):
        # Excludes non finite values
        invalid_indices = set()
        for i, v in enumerate(values):
            if not np.isfinite(v.value) or not np.isfinite(v.weight):
                invalid_indices.add(i)
        values = [v for i, v in enumerate(values) if i not in invalid_indices]

        if len(values) != 0:
            # Calc weighted average. Weights are normalized to sum to 1.
            sum_weights = sum(v.weight for i, v in enumerate(values))
            sum_value = sum(v.value * v.weight for i, v in enumerate(values))
            if sum_weights == 0:
                warnings.warn("weight is zero")
                retval = np.nan
            else:
                retval = sum_value / sum_weights
        else:
            warnings.warn("No valid stats found")
            retval = np.nan

    else:
        raise NotImplementedError(f"type={type(values[0])}")
    assert check_return_type(retval)
    return retval
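A small sketch of how to_reported_value (Example #17) and aggregate compose; the numbers are made up for illustration.

losses = [to_reported_value(0.9, weight=4), to_reported_value(0.7, weight=6)]
print(aggregate(losses))  # weighted mean: (0.9*4 + 0.7*6) / 10 ~= 0.78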
Example #23
    def build_preprocess_fn(
        cls, args: argparse.Namespace, train: bool
    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
        """Build pre-processing function.

        Args:
            cls: ASRTransducerTask object.
            args: Task arguments.
            train: Training mode.

        Return:
            : Callable pre-processing function.

        """
        assert check_argument_types()

        if args.use_preprocessor:
            retval = CommonPreprocessor(
                train=train,
                token_type=args.token_type,
                token_list=args.token_list,
                bpemodel=args.bpemodel,
                non_linguistic_symbols=args.non_linguistic_symbols,
                text_cleaner=args.cleaner,
                g2p_type=args.g2p,
                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
                rir_apply_prob=args.rir_apply_prob if hasattr(
                    args, "rir_apply_prob") else 1.0,
                noise_scp=args.noise_scp
                if hasattr(args, "noise_scp") else None,
                noise_apply_prob=args.noise_apply_prob if hasattr(
                    args, "noise_apply_prob") else 1.0,
                noise_db_range=args.noise_db_range if hasattr(
                    args, "noise_db_range") else "13_15",
                speech_volume_normalize=args.speech_volume_normalize
                if hasattr(args, "speech_volume_normalize") else None,
            )
        else:
            retval = None

        assert check_return_type(retval)
        return retval
Example #24
    def __call__(
        self, uid: str, data: Dict[str, Union[str, np.ndarray]]
    ) -> Dict[str, np.ndarray]:
        assert check_argument_types()

        if self.speech_name in data:
            # Nothing now: candidates:
            # - STFT
            # - Fbank
            # - CMVN
            # - Data augmentation
            pass

        if self.text_name in data and self.tokenizer is not None:
            text = data[self.text_name]
            tokens = self.tokenizer.text2tokens(text)
            text_ints = self.token_id_converter.tokens2ids(tokens)
            data[self.text_name] = np.array(text_ints, dtype=np.int64)
        assert check_return_type(data)
        return data
Example #25
    def build_model(cls, args: argparse.Namespace) -> ModNet:
        assert check_argument_types()

        # 1. frontend
        if args.input_size is None:
            # Extract features in the model
            frontend_class = frontend_choices.get_class(args.frontend)
            frontend = frontend_class(**args.frontend_conf)
            input_size = frontend.output_size()
        else:
            # Give features from data-loader
            args.frontend = None
            args.frontend_conf = {}
            frontend = None
            input_size = args.input_size

        # 4. Encoder
        encoder_class = encoder_choices.get_class(args.encoder)
        encoder = encoder_class(input_size=input_size, **args.encoder_conf)

        # Projection
        projector_class = projector_choices.get_class(args.projector)
        encoder_output_size = encoder.output_size()
        projector = projector_class(input_size=encoder_output_size,
                                    output_size=input_size)

        # 8. Build model
        model = ModNet(
            frontend=frontend,
            encoder=encoder,
            projector=projector,
            **args.model_conf,
        )

        # FIXME(kamo): Should be done in model?
        # 9. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
Example #26
def hard_calls(probabilities: Union[Column, str],
               numAlts: Union[Column, str],
               phased: Union[Column, str],
               threshold: float = None) -> Column:
    """
    Converts an array of probabilities to hard calls. The probabilities are assumed to be diploid. See :ref:`variant-data-transformations` for more details.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.createDataFrame([Row(probs=[0.95, 0.05, 0.0])])
        >>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False)).alias('calls')).collect()
        [Row(calls=[0, 0])]
        >>> df = spark.createDataFrame([Row(probs=[0.05, 0.95, 0.0])])
        >>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False)).alias('calls')).collect()
        [Row(calls=[0, 1])]
        >>> # Use the threshold parameter to change the minimum probability required for a call
        >>> df = spark.createDataFrame([Row(probs=[0.05, 0.95, 0.0])])
        >>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False), threshold=0.99).alias('calls')).collect()
        [Row(calls=[-1, -1])]

    Args:
        probabilities : The array of probabilities to convert
        numAlts : The number of alternate alleles
        phased : Whether the probabilities are phased. If phased, we expect ``2 * numAlts`` values in the probabilities array. If unphased, we expect one probability per possible genotype.
        threshold : The minimum probability to make a call. If no probability falls into the range of ``[0, 1 - threshold]`` or ``[threshold, 1]``, a no-call (represented by ``-1`` s) will be emitted. If not provided, this parameter defaults to ``0.9``.

    Returns:
        An array of hard calls
    """
    assert check_argument_types()
    if threshold is None:
        output = Column(sc()._jvm.io.projectglow.functions.hard_calls(
            _to_java_column(probabilities), _to_java_column(numAlts),
            _to_java_column(phased)))
    else:
        output = Column(sc()._jvm.io.projectglow.functions.hard_calls(
            _to_java_column(probabilities), _to_java_column(numAlts),
            _to_java_column(phased), threshold))
    assert check_return_type(output)
    return output
Example #27
 def build_preprocess_fn(
     cls, args: argparse.Namespace, train: bool
 ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
     assert check_argument_types()
     # TODO(Jing): ask Kamo if it is OK to support several args,
     # like text_name = 'text_ref1' and 'text_ref2'
     if args.use_preprocessor:
         retval = CommonPreprocessor_multi(
             train=train,
             token_type=args.token_type,
             token_list=args.token_list,
             bpemodel=args.bpemodel,
             non_linguistic_symbols=args.non_linguistic_symbols,
             text_name=["text_ref1", "text_ref2"],
             text_cleaner=args.cleaner,
             g2p_type=args.g2p,
         )
     else:
         retval = None
     assert check_return_type(retval)
     return retval
Example #28
    def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel:
        assert check_argument_types()

        encoder = encoder_choices.get_class(args.encoder)(**args.encoder_conf)
        separator = separator_choices.get_class(args.separator)(
            encoder.output_dim, **args.separator_conf)
        decoder = decoder_choices.get_class(args.decoder)(**args.decoder_conf)

        # 1. Build model
        model = ESPnetEnhancementModel(encoder=encoder,
                                       separator=separator,
                                       decoder=decoder,
                                       **args.model_conf)

        # FIXME(kamo): Should be done in model?
        # 2. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
Example #29
    def _speech_process(
        self, data: Dict[str, Union[str, np.ndarray]]
    ) -> Dict[str, Union[str, np.ndarray]]:
        assert check_argument_types()
        if self.speech_name in data:
            if self.train and (self.rirs is not None
                               or self.noises is not None):
                speech = data[self.speech_name]

                # speech: (Nmic, Time)
                if speech.ndim == 1:
                    speech = speech[None, :]
                else:
                    speech = speech.T
                # Calc power on non silence region
                power = (speech[detect_non_silence(speech)]**2).mean()

                # 1. Convolve RIR
                if (self.rirs is not None
                        and self.rir_apply_prob >= np.random.random()):
                    speech, _ = self._convolve_rir(speech, power)

                # 2. Add Noise
                if (self.noises is not None
                        and self.noise_apply_prob >= np.random.random()):
                    speech, _ = self._add_noise(speech, power)

                speech = speech.T
                ma = np.max(np.abs(speech))
                if ma > 1.0:
                    speech /= ma
                data[self.speech_name] = speech

            if self.speech_volume_normalize is not None:
                speech = data[self.speech_name]
                ma = np.max(np.abs(speech))
                data[self.speech_name] = (
                    speech * self.speech_volume_normalize / ma)
        assert check_return_type(data)
        return data
Example #30
    def build_model(cls, args: argparse.Namespace) -> ESPnetEnhS2TModel:
        assert check_argument_types()

        # Build submodels in the order of subtask_series
        model_conf = args.model_conf.copy()
        for _, subtask in enumerate(args.subtask_series):
            subtask_conf = dict(
                init=None, model_conf=eval(f"args.{subtask}_model_conf")
            )

            for attr in eval(f"{subtask}_attributes"):
                subtask_conf[attr] = (
                    getattr(args, subtask + "_" + attr, None)
                    if getattr(args, subtask + "_" + attr, None) is not None
                    else getattr(args, attr, None)
                )

            if subtask in ["asr", "st", "diar"]:
                m_subtask = "s2t"
            elif subtask in ["enh"]:
                m_subtask = subtask
            else:
                raise ValueError(f"{subtask} not supported.")

            logging.info(f"Building {subtask} task model, using config: {subtask_conf}")

            model_conf[f"{m_subtask}_model"] = name2task[subtask].build_model(
                argparse.Namespace(**subtask_conf)
            )

        # 8. Build model
        model = ESPnetEnhS2TModel(**model_conf)

        # FIXME(kamo): Should be done in model?
        # 9. Initialize
        if args.init is not None:
            initialize(model, args.init)

        assert check_return_type(model)
        return model
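Hedged aside: the eval(f"args.{subtask}_model_conf") lookups above are attribute lookups by name and can be expressed with getattr, which avoids evaluating strings; the namespace and the "enh" key below are stand-ins, not a real ESPnet config.

import argparse

args_demo = argparse.Namespace(enh_model_conf={"num_spk": 2})
subtask = "enh"
print(getattr(args_demo, f"{subtask}_model_conf"))  # -> {'num_spk': 2}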