def _text_process(
    self, data: Dict[str, Union[str, np.ndarray]]
) -> Dict[str, np.ndarray]:
    """Convert every configured text entry of ``data`` into token ids.

    For each tokenizer slot ``i`` (``0 <= i < self.num_tokenizer``), the
    entry stored under ``self.text_name[i]`` is cleaned with
    ``self.text_cleaner``, tokenized, mapped to integer ids and written back
    under the same key as an ``np.int64`` array. Slots whose key is missing
    from ``data`` or whose tokenizer is ``None`` are skipped.

    Args:
        data: Sample dictionary; it is modified in place.

    Returns:
        The same ``data`` object after conversion.
    """
    for slot in range(self.num_tokenizer):
        key = self.text_name[slot]
        # Skip slots with no text in this sample or no tokenizer configured
        if key not in data or self.tokenizer[slot] is None:
            continue
        cleaned = self.text_cleaner(data[key])
        token_ids = self.token_id_converter[slot].tokens2ids(
            self.tokenizer[slot].text2tokens(cleaned)
        )
        data[key] = np.array(token_ids, dtype=np.int64)
    assert check_return_type(data)
    return data
def optional_data_names(
    cls, train: bool = True, inference: bool = False
) -> Tuple[str, ...]:
    """Return the names of the optional data entries.

    The result is ``"dereverb_ref"`` followed by ``speech_ref2`` ..
    ``speech_ref{MAX_REFERENCE_NUM}`` and ``noise_ref1`` ..
    ``noise_ref{MAX_REFERENCE_NUM}``.

    Args:
        cls: Not referenced in the body.
        train: Not referenced in the body.
        inference: Not referenced in the body.

    Returns:
        Tuple of optional data names.
    """
    # speech_ref numbering starts at 2, noise_ref numbering starts at 1
    extra_speech = ["speech_ref{}".format(n) for n in range(2, MAX_REFERENCE_NUM + 1)]
    extra_noise = ["noise_ref{}".format(n) for n in range(1, MAX_REFERENCE_NUM + 1)]
    retval = ("dereverb_ref", *extra_speech, *extra_noise)
    assert check_return_type(retval)
    return retval
def build_model(cls, args: argparse.Namespace) -> ESPnetTTSModel:
    """Build an ``ESPnetTTSModel`` from the parsed task arguments.

    Args:
        cls: Not referenced in the body.
        args: Parsed arguments. ``args.token_list`` is either a path to a
            UTF-8 text file with one token per line, or a list/tuple of
            tokens. ``args`` is mutated: a path in ``token_list`` is replaced
            by the loaded list, and ``feats_extract`` / ``feats_extract_conf``
            are reset to ``None`` when ``args.odim`` is given.

    Returns:
        The assembled model.

    Raises:
        RuntimeError: If ``args.token_list`` is not a str, list or tuple.
    """
    assert check_argument_types()
    if isinstance(args.token_list, str):
        with open(args.token_list, encoding="utf-8") as f:
            token_list = [line.rstrip() for line in f]

        # "args" is saved as it is in a yaml file by BaseTask.main().
        # Overwriting token_list to keep it as "portable".
        args.token_list = token_list.copy()
    elif isinstance(args.token_list, (tuple, list)):
        # list() rather than .copy(): tuples are accepted here but have no
        # .copy() method, so the previous code raised AttributeError for them.
        token_list = list(args.token_list)
    else:
        raise RuntimeError("token_list must be str, list or tuple")

    vocab_size = len(token_list)
    logging.info(f"Vocabulary size: {vocab_size }")

    # 1. feats_extract
    if args.odim is None:
        # Extract features in the model
        feats_extract_class = feats_extractor_choices.get_class(args.feats_extract)
        feats_extract = feats_extract_class(**args.feats_extract_conf)
        odim = feats_extract.output_size()
    else:
        # Give features from data-loader
        args.feats_extract = None
        args.feats_extract_conf = None
        feats_extract = None
        odim = args.odim

    # 2. Normalization layer
    if args.normalize is not None:
        normalize_class = normalize_choices.get_class(args.normalize)
        normalize = normalize_class(**args.normalize_conf)
    else:
        normalize = None

    # 3. TTS
    tts_class = tts_choices.get_class(args.tts)
    tts = tts_class(idim=vocab_size, odim=odim, **args.tts_conf)

    # 4. Build model
    model = ESPnetTTSModel(
        feats_extract=feats_extract,
        normalize=normalize,
        tts=tts,
        **args.model_conf,
    )
    assert check_return_type(model)
    return model
def get_class(self, name: Optional[str]) -> Optional[type]:
    """Return the class registered under ``name`` (case-insensitive).

    Args:
        name: Registered class name, or ``None``. When ``self.optional`` is
            true, the strings ``"none"``, ``"null"`` and ``"nil"`` (any case)
            are also treated as "no class".

    Returns:
        The matching entry of ``self.classes``, or ``None`` when no class
        was requested.

    Raises:
        ValueError: If ``name`` is not a key of ``self.classes``.
    """
    assert check_argument_types()
    key = None if name is None else name.lower()
    # Membership test (`in`): comparing the string to the tuple with `==`
    # was always False, so "none"/"null"/"nil" were never recognized.
    if key is None or (self.optional and key in ("none", "null", "nil")):
        retval = None
    elif key in self.classes:
        # Look up with the same lower-cased key that was just tested, so a
        # mixed-case name cannot pass the check and then raise KeyError.
        class_obj = self.classes[key]
        assert check_return_type(class_obj)
        retval = class_obj
    else:
        raise ValueError(f"--{self.name} must be one of {self.choices()}: "
                         f"--{self.name} {name.lower()}")
    return retval
def _get_genotypes_col(entry: StructExpression, sample_ids: Optional[List[str]]) -> Column:
    """Build the ``genotypes`` column expression from an entry struct.

    Each field of ``entry`` becomes one name/value pair of a SQL
    ``named_struct``. The struct is applied to every element of the
    ``entries`` array (``transform``), or zipped with the sample IDs
    (``zip_with``) when ``sample_ids`` is given.

    Args:
        entry: Entry struct whose field names and dtypes drive the mapping.
        sample_ids: Optional sample IDs, spliced verbatim into an
            ``array(...)`` SQL expression.
            NOTE(review): no quoting is applied here; presumably callers
            pass ready-made SQL literals — confirm.

    Returns:
        The column expression aliased as ``genotypes``.
    """
    assert check_argument_types()
    entry_aliases = {
        'DP': 'depth',
        'FT': 'filters',
        'GL': 'genotypeLikelihoods',
        'PL': 'phredLikelihoods',
        'GP': 'posteriorProbabilities',
        'GQ': 'conditionalQuality',
        'HQ': 'haplotypeQualities',
        'EC': 'expectedAlleleCounts',
        'MQ': 'mappingQuality',
        'AD': 'alleleDepths'
    }

    def _named_struct_arg(field):
        """Return the ``'name', value`` fragment for one entry field."""
        if field == 'GT' and entry.GT.dtype == tcall:
            # Flatten GT into calls and phased
            return "'calls', e.GT.alleles, 'phased', e.GT.phased"
        if entry[field].dtype == tcall:
            # Turn other call fields (eg. PGT) into a string
            return f"'{field}', array_join(e.{field}.alleles, if(e.{field}.phased, '|', '/'))"
        # Aliased fields get their alias as name, all others keep their own
        return f"'{entry_aliases.get(field, field)}', e.{field}"

    base_struct_args = [_named_struct_arg(field) for field in entry]

    if sample_ids is None:
        struct_expr = ' ,'.join(base_struct_args)
        genotypes_col = fx.expr(f"transform(entries, e -> named_struct({struct_expr}))")
    else:
        sample_id_expr = f"array({' ,'.join(sample_ids)})"
        struct_expr = ' ,'.join(["'sampleId', s"] + base_struct_args)
        genotypes_col = fx.expr(
            f"zip_with({sample_id_expr}, entries, (s, e) -> named_struct({struct_expr}))"
        )

    genotypes_col = genotypes_col.alias("genotypes")
    assert check_return_type(genotypes_col)
    return genotypes_col
def taskwrapper(*args, **kwargs):
    """Call ``target`` with logging, type checks and result wrapping.

    ``target``, ``name`` and ``validator`` are not defined in this function;
    they are read from the enclosing scope (not visible here).

    Argument and return type mismatches are only logged, never raised.
    Exceptions raised by ``target`` itself are logged and re-raised.

    Returns:
        The value returned by ``target``; wrapped in ``ATable`` when
        ``target`` is a generator function or the value is a plain list.
    """
    _repr_function(target=target, args=args, kwargs=kwargs)

    # Warning for explicit parameters: show at most 20 characters per argument
    if args:
        _LOGGER.warning(
            "Use explicit parameters, instead of %s", [str(a)[:20] for a in args]
        )

    # Warning on parameter types
    memo = _CallMemo(target, args=args, kwargs=kwargs)
    try:
        check_argument_types(memo)
    except TypeError as type_err:
        _LOGGER.warning(type_err)

    if validator:
        validator(kwargs)

    try:
        value = target(*args, **kwargs)
    except Exception as run_err:
        _LOGGER.error(
            "Error while running task `%s` - %s: %s",
            name,
            type(run_err).__name__,
            run_err,
        )
        raise

    is_plain_list = isinstance(value, list) and not isinstance(value, ATable)
    if isgeneratorfunction(target) or is_plain_list:
        value = ATable(value)

    try:
        check_return_type(value, memo)
    except TypeError as type_err:
        _LOGGER.error(type_err)
    return value
def logistic_regression_gwas(genotypes: Union[Column, str], phenotypes: Union[Column, str], covariates: Union[Column, str], test: str, offset: Union[Column, str] = None) -> Column:
    """
    Runs a logistic regression association test that is optimized for performance in a GWAS setting. See :ref:`logistic-regression` for more details.

    Added in version 0.3.0.

    Examples:
        >>> from pyspark.ml.linalg import DenseMatrix
        >>> phenotypes = [1, 0, 0, 1, 1]
        >>> genotypes = [0, 0, 1, 2, 2]
        >>> covariates = DenseMatrix(numRows=5, numCols=1, values=[1, 1, 1, 1, 1])
        >>> offset = [1, 0, 1, 0, 1]
        >>> df = spark.createDataFrame([Row(genotypes=genotypes, phenotypes=phenotypes, covariates=covariates, offset=offset)])
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'Firth'))).collect()
        [Row(beta=0.7418937644793101, oddsRatio=2.09990848346903, waldConfidenceInterval=[0.2509874689201784, 17.569066925598555], pValue=0.3952193664793294)]
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'LRT'))).collect()
        [Row(beta=1.1658962684583645, oddsRatio=3.208797538802116, waldConfidenceInterval=[0.29709600522888285, 34.65674887513274], pValue=0.2943946848756769)]
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'Firth', 'offset'))).collect()
        [Row(beta=0.8024832156793392, oddsRatio=2.231074294047771, waldConfidenceInterval=[0.2540891981649045, 19.590334974925725], pValue=0.3754070658316332)]
        >>> df.select(glow.expand_struct(glow.logistic_regression_gwas('genotypes', 'phenotypes', 'covariates', 'LRT', 'offset'))).collect()
        [Row(beta=1.1996041727573317, oddsRatio=3.3188029900720117, waldConfidenceInterval=[0.3071189078535928, 35.863807161497334], pValue=0.2857137988674153)]

    Args:
        genotypes : A numeric array of genotypes
        phenotypes : A double array of phenotype values
        covariates : A ``spark.ml`` ``Matrix`` of covariates
        test : Which logistic regression test to use. Can be ``LRT`` or ``Firth``
        offset : An optional double array of offset values. The offset vector is added with coefficient 1 to the linear predictor term X*b.

    Returns:
        A struct containing ``beta``, ``oddsRatio``, ``waldConfidenceInterval``, and ``pValue`` fields. See :ref:`logistic-regression`.
    """
    assert check_argument_types()
    jvm_function = sc()._jvm.io.projectglow.functions.logistic_regression_gwas
    jvm_args = [
        _to_java_column(genotypes),
        _to_java_column(phenotypes),
        _to_java_column(covariates),
        test,
    ]
    # The offset is only forwarded when the caller supplied one
    if offset is not None:
        jvm_args.append(_to_java_column(offset))
    output = Column(jvm_function(*jvm_args))
    assert check_return_type(output)
    return output
def load_num_sequence_text(
    path: Union[Path, str], loader_type: str = "csv_int"
) -> Dict[str, np.ndarray]:
    """Read a text file indicating sequences of number

    Each line holds a key followed by a sequence of numbers. The numbers are
    separated by spaces for the ``text_*`` loader types and by commas for
    the ``csv_*`` loader types.

    Examples:
        key1 1 2 3
        key2 34 5 6

        >>> d = load_num_sequence_text('text', loader_type='text_int')
        >>> np.testing.assert_array_equal(d["key1"], np.array([1, 2, 3]))

    Args:
        path: File to read; it is parsed by ``read_2column_text``.
        loader_type: One of ``text_int``, ``text_float``, ``csv_int`` or
            ``csv_float``.

    Returns:
        Mapping from key to a 1-D (or higher) array of the parsed numbers.

    Raises:
        ValueError: If ``loader_type`` is unknown, or a value cannot be
            parsed (the offending entry is logged before re-raising).
    """
    assert check_argument_types()
    # loader_type -> (delimiter, dtype).
    # The builtin ``int`` replaces ``np.long``: that name was only an alias
    # of ``int`` and was removed in NumPy 1.24.
    loader_specs = {
        "text_int": (" ", int),
        "text_float": (" ", np.float32),
        "csv_int": (",", int),
        "csv_float": (",", np.float32),
    }
    if loader_type not in loader_specs:
        raise ValueError(f"Not supported loader_type={loader_type}")
    delimiter, dtype = loader_specs[loader_type]

    # path looks like:
    #   utta 1,0
    #   uttb 3,4,5
    # -> return {'utta': np.ndarray([1, 0]),
    #            'uttb': np.ndarray([3, 4, 5])}
    d = read_2column_text(path)

    # Using for-loop instead of dict-comprehension for debuggability
    retval = {}
    for k, v in d.items():
        try:
            retval[k] = np.loadtxt(StringIO(v), ndmin=1, dtype=dtype, delimiter=delimiter)
        except ValueError:
            logging.error(f'Error happened with path="{path}", id="{k}", value="{v}"')
            raise
    assert check_return_type(retval)
    return retval
def _convert_numpy_to_java_array(np_arr: np.ndarray) -> JavaArray:
    """
    Converts a flat numpy array of doubles to a Java array of doubles.

    The array is serialized to big-endian bytes and handed to the JVM helper
    ``PythonUtils.doubleArrayFromBytes`` together with its element count.
    The input must be 1-dimensional with dtype ``np.double`` (asserted).
    """
    assert check_argument_types()
    assert np_arr.ndim == 1
    assert np_arr.dtype.type == np.double

    spark_context = SparkContext._active_spark_context
    num_elements = np_arr.shape[0]
    # Convert to big endian and serialize
    big_endian = np.ascontiguousarray(np_arr, dtype='>d')
    payload = big_endian.tobytes()
    java_arr = spark_context._jvm.io.projectglow.common.PythonUtils.doubleArrayFromBytes(
        num_elements, payload)
    assert check_return_type(java_arr)
    return java_arr
def _convert_numpy_to_java_array(np_arr: np.ndarray) -> JavaArray:
    """
    Converts a flat numpy array of doubles to a Java array of doubles.

    A Java ``double`` array of matching length is allocated through the
    gateway and filled one element at a time. The input must be
    1-dimensional with dtype ``np.double`` (asserted).
    """
    assert check_argument_types()
    assert np_arr.ndim == 1
    assert np_arr.dtype.type == np.double

    spark_context = SparkContext._active_spark_context
    java_arr = spark_context._gateway.new_array(spark_context._jvm.double, np_arr.shape[0])
    # tolist() yields plain Python floats, the same values item() gives per element
    for position, value in enumerate(np_arr.tolist()):
        java_arr[position] = value
    assert check_return_type(java_arr)
    return java_arr
def build_preprocess_fn(
    cls, args: argparse.Namespace, train: bool
) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
    """Build the pre-processing callable, if enabled.

    Args:
        cls: Not referenced in the body.
        args: Parsed arguments; ``use_preprocessor``, ``token_type``,
            ``token_list`` and ``bpemodel`` are read.
        train: Forwarded to the preprocessor as ``train``.

    Returns:
        A ``CommonPreprocessor`` when ``args.use_preprocessor`` is truthy,
        otherwise ``None``.
    """
    assert check_argument_types()
    retval = None
    if args.use_preprocessor:
        preprocessor_kwargs = dict(
            train=train,
            token_type=args.token_type,
            token_list=args.token_list,
            bpemodel=args.bpemodel,
        )
        retval = CommonPreprocessor(**preprocessor_kwargs)
    assert check_return_type(retval)
    return retval
def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel:
    """Build an ``ESPnetEnhancementModel`` around a single enhancement module.

    Args:
        cls: Not referenced in the body.
        args: Parsed arguments; ``enh``, ``enh_conf`` and ``init`` are read.

    Returns:
        The model, initialized with ``initialize`` when ``args.init`` is set.
    """
    assert check_argument_types()

    enh_class = enh_choices.get_class(args.enh)
    enh_model = enh_class(**args.enh_conf)

    # 1. Build model
    model = ESPnetEnhancementModel(enh_model=enh_model)

    # FIXME(kamo): Should be done in model?
    # 2. Initialize
    init_method = args.init
    if init_method is not None:
        initialize(model, init_method)

    assert check_return_type(model)
    return model
def normalize_variant(contigName: Union[Column, str], start: Union[Column, str], end: Union[Column, str], refAllele: Union[Column, str], altAlleles: Union[Column, str], refGenomePathString: str) -> Column:
    """
    Normalizes the variant with a behavior similar to vt normalize or bcftools norm.
    Creates a StructType column including the normalized ``start``, ``end``, ``referenceAllele`` and
    ``alternateAlleles`` fields (whether they are changed or unchanged as the result of
    normalization) as well as a StructType field called ``normalizationStatus`` that
    contains the following fields:

       ``changed``: A boolean field indicating whether the variant data was changed as a result of normalization

       ``errorMessage``: An error message in case the attempt at normalizing the row hit an error. In this case, the ``changed`` field will be set to ``false``. If no errors occur, this field will be ``null``.

    In case of an error, the ``start``, ``end``, ``referenceAllele`` and ``alternateAlleles`` fields in the generated struct will be ``null``.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.read.format('vcf').load('test-data/variantsplitternormalizer-test/test_left_align_hg38_altered.vcf')
        >>> ref_genome = 'test-data/variantsplitternormalizer-test/Homo_sapiens_assembly38.20.21_altered.fasta'
        >>> df.select('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles').head()
        Row(contigName='chr20', start=400, end=401, referenceAllele='G', alternateAlleles=['GATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGGTTTGAG'])
        >>> normalized_df = df.select('contigName', glow.expand_struct(glow.normalize_variant('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles', ref_genome)))
        >>> normalized_df.head()
        Row(contigName='chr20', start=268, end=269, referenceAllele='A', alternateAlleles=['ATTTGAGATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGG'], normalizationStatus=Row(changed=True, errorMessage=None))

    Args:
        contigName : The current contig name
        start : The current start
        end : The current end
        refAllele : The current reference allele
        altAlleles : The current array of alternate alleles
        refGenomePathString : A path to the reference genome ``.fasta`` file. The ``.fasta`` file must be accompanied with a ``.fai`` index file in the same folder.

    Returns:
        A struct as explained above
    """
    assert check_argument_types()
    jvm_function = sc()._jvm.io.projectglow.functions.normalize_variant
    # Column-like arguments are converted; the genome path is passed as a plain string
    java_columns = [
        _to_java_column(column)
        for column in (contigName, start, end, refAllele, altAlleles)
    ]
    output = Column(jvm_function(*java_columns, refGenomePathString))
    assert check_return_type(output)
    return output
def block_variants_and_samples(
        variant_df: DataFrame, sample_ids: List[str], variants_per_block: int,
        sample_block_count: int) -> (DataFrame, Dict[str, List[str]]):
    """
    Creates a blocked GT matrix and index mapping from sample blocks to a list of corresponding sample IDs. Uses the
    same sample-blocking logic as the blocked GT matrix transformer.

    Requires that:

    - Each variant row has the same number of values
    - The number of values per row matches the number of sample IDs

    Args:
        variant_df : The variant DataFrame
        sample_ids : The list of sample ID strings
        variants_per_block : The number of variants per block
        sample_block_count : The number of sample blocks

    Returns:
        tuple of (blocked GT matrix, index mapping)

    Raises:
        Exception: If the DataFrame is empty, rows differ in their number of
            values, or that number differs from ``len(sample_ids)``.
    """
    assert check_argument_types()

    # One row per distinct "values" array length found in the DataFrame
    value_lengths = variant_df.selectExpr("size(values) as numValues").distinct()
    num_distinct_lengths = value_lengths.count()
    if num_distinct_lengths == 0:
        raise Exception("DataFrame has no values.")
    if num_distinct_lengths > 1:
        raise Exception("Each row must have the same number of values.")

    if value_lengths.head().numValues != len(sample_ids):
        raise Exception(
            "Number of values does not match between DataFrame and sample ID list."
        )

    __validate_sample_ids(sample_ids)

    blocked_gt = glow.transform(
        "block_variants_and_samples",
        variant_df,
        variants_per_block=variants_per_block,
        sample_block_count=sample_block_count,
    )
    index_map = __get_index_map(sample_ids, sample_block_count, variant_df.sql_ctx)

    output = (blocked_gt, index_map)
    assert check_return_type(output)
    return output
def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel:
    """Build an ``ESPnetEnhancementModel`` from its sub-modules.

    Encoder, separator and decoder are always built. A mask module is built
    only when ``args.separator`` ends with ``"nomask"``. One loss wrapper is
    built per entry of ``args.criterions`` when that attribute exists and is
    not ``None``.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments.

    Returns:
        The model, initialized with ``initialize`` when ``args.init`` is set.
    """
    assert check_argument_types()

    encoder_class = encoder_choices.get_class(args.encoder)
    encoder = encoder_class(**args.encoder_conf)

    separator_class = separator_choices.get_class(args.separator)
    separator = separator_class(encoder.output_dim, **args.separator_conf)

    decoder_class = decoder_choices.get_class(args.decoder)
    decoder = decoder_class(**args.decoder_conf)

    mask_module = None
    if args.separator.endswith("nomask"):
        mask_module_class = mask_module_choices.get_class(args.mask_module)
        mask_module = mask_module_class(
            input_dim=encoder.output_dim,
            **args.mask_module_conf,
        )

    loss_wrappers = []
    # This check is for the compatibility when load models
    # that packed by older version
    criterions = getattr(args, "criterions", None)
    if criterions is not None:
        for ctr in criterions:
            criterion_class = criterion_choices.get_class(ctr["name"])
            criterion = criterion_class(**ctr.get("conf", {}))
            wrapper_class = loss_wrapper_choices.get_class(ctr["wrapper"])
            loss_wrappers.append(
                wrapper_class(criterion=criterion, **ctr["wrapper_conf"])
            )

    # 1. Build model
    model = ESPnetEnhancementModel(
        encoder=encoder,
        separator=separator,
        decoder=decoder,
        loss_wrappers=loss_wrappers,
        mask_module=mask_module,
        **args.model_conf,
    )

    # FIXME(kamo): Should be done in model?
    # 2. Initialize
    if args.init is not None:
        initialize(model, args.init)

    assert check_return_type(model)
    return model
def optional_data_names(
    cls, train: bool = True, inference: bool = False
) -> Tuple[str, ...]:
    """Optional data depending on task mode.

    Args:
        cls: ASRTransducerTask object.
        train: Training mode (not referenced in the body).
        inference: Inference mode (not referenced in the body).

    Return:
        retval: Optional task data; always an empty tuple here.

    """
    retval = tuple()
    assert check_return_type(retval)
    return retval
def to_reported_value(v: Num, weight: Num = None) -> "ReportedValue":
    """Wrap a value (and optional weight) into a reportable statistic.

    Tensors and arrays must hold exactly one element; they are converted to
    Python scalars with ``.item()``.

    Args:
        v: The value to report.
        weight: Optional weight of the value.

    Returns:
        ``WeightedAverage(v, weight)`` when a weight is given, otherwise
        ``Average(v)``.

    Raises:
        ValueError: If ``v`` or ``weight`` is a tensor/array whose total
            number of elements is not 1.
    """
    assert check_argument_types()
    array_like = (torch.Tensor, np.ndarray)

    if isinstance(v, array_like):
        if np.prod(v.shape) != 1:
            raise ValueError(f"v must be 0 or 1 dimension: {len(v.shape)}")
        v = v.item()

    if isinstance(weight, array_like):
        if np.prod(weight.shape) != 1:
            raise ValueError(f"weight must be 0 or 1 dimension: {len(weight.shape)}")
        weight = weight.item()

    retval = Average(v) if weight is None else WeightedAverage(v, weight)
    assert check_return_type(retval)
    return retval
def build_preprocess_fn(
    cls, args: argparse.Namespace, train: bool
) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
    """Build the pre-processing callable for the configured subtasks.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments.
        train: Forwarded to the preprocessor as ``train``.

    Returns:
        ``None`` when ``args.use_preprocessor`` is falsy. Otherwise a
        ``MutliTokenizerCommonPreprocessor`` (texts ``text`` and
        ``src_text``) when ``"st"`` is in ``args.subtask_series``, else a
        ``CommonPreprocessor_multi`` (text ``text`` only).
    """
    assert check_argument_types()
    if not args.use_preprocessor:
        retval = None
    elif "st" in args.subtask_series:
        retval = MutliTokenizerCommonPreprocessor(
            train=train,
            token_type=[args.token_type, args.src_token_type],
            token_list=[args.token_list, args.src_token_list],
            bpemodel=[args.bpemodel, args.src_bpemodel],
            non_linguistic_symbols=args.non_linguistic_symbols,
            text_cleaner=args.cleaner,
            g2p_type=args.g2p,
            # NOTE(kamo): Check attribute existence for backward compatibility
            rir_scp=getattr(args, "rir_scp", None),
            rir_apply_prob=getattr(args, "rir_apply_prob", 1.0),
            noise_scp=getattr(args, "noise_scp", None),
            noise_apply_prob=getattr(args, "noise_apply_prob", 1.0),
            noise_db_range=getattr(args, "noise_db_range", "13_15"),
            speech_volume_normalize=getattr(args, "speech_volume_normalize", None),
            speech_name="speech",
            text_name=["text", "src_text"],
        )
    else:
        retval = CommonPreprocessor_multi(
            train=train,
            token_type=args.token_type,
            token_list=args.token_list,
            bpemodel=args.bpemodel,
            non_linguistic_symbols=args.non_linguistic_symbols,
            text_name=["text"],
            text_cleaner=args.cleaner,
            g2p_type=args.g2p,
        )
    assert check_return_type(retval)
    return retval
def lift_over_coordinates(contigName: Union[Column, str], start: Union[Column, str], end: Union[Column, str], chainFile: str, minMatchRatio: float = None) -> Column:
    """
    Lifts over the coordinates of a variant. To perform liftover of alleles and add additional metadata, see :ref:`liftover`.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.read.format('vcf').load('test-data/liftover/unlifted.test.vcf').where('start = 18210071')
        >>> chain_file = 'test-data/liftover/hg38ToHg19.over.chain.gz'
        >>> reference_file = 'test-data/liftover/hg19.chr20.fa.gz'
        >>> df.select('contigName', 'start', 'end').head()
        Row(contigName='chr20', start=18210071, end=18210072)
        >>> lifted_df = df.select(glow.expand_struct(glow.lift_over_coordinates('contigName', 'start', 'end', chain_file)))
        >>> lifted_df.head()
        Row(contigName='chr20', start=18190715, end=18190716)

    Args:
        contigName : The current contig name
        start : The current start
        end : The current end
        chainFile : Location of the chain file on each node in the cluster
        minMatchRatio : Minimum fraction of bases that must remap to do liftover successfully. If not provided, defaults to ``0.95``.

    Returns:
        A struct containing ``contigName``, ``start``, and ``end`` fields after liftover
    """
    assert check_argument_types()
    jvm_function = sc()._jvm.io.projectglow.functions.lift_over_coordinates
    jvm_args = [
        _to_java_column(contigName),
        _to_java_column(start),
        _to_java_column(end),
        chainFile,
    ]
    # The ratio is only forwarded when the caller supplied one
    if minMatchRatio is not None:
        jvm_args.append(minMatchRatio)
    output = Column(jvm_function(*jvm_args))
    assert check_return_type(output)
    return output
def build_preprocess_fn(
    cls, args: argparse.Namespace, train: bool
) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
    """Build the two-tokenizer pre-processing callable, if enabled.

    The first tokenizer slot handles ``text`` and the second handles
    ``src_text``; each slot gets its own token type, token list and BPE
    model.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments.
        train: Forwarded to the preprocessor as ``train``.

    Returns:
        A ``MutliTokenizerCommonPreprocessor`` when
        ``args.use_preprocessor`` is truthy, otherwise ``None``.
    """
    assert check_argument_types()
    retval = None
    if args.use_preprocessor:
        # Per-slot settings, ordered as [target, source]
        token_types = [args.token_type, args.src_token_type]
        token_lists = [args.token_list, args.src_token_list]
        bpemodels = [args.bpemodel, args.src_bpemodel]
        retval = MutliTokenizerCommonPreprocessor(
            train=train,
            token_type=token_types,
            token_list=token_lists,
            bpemodel=bpemodels,
            non_linguistic_symbols=args.non_linguistic_symbols,
            text_cleaner=args.cleaner,
            g2p_type=args.g2p,
            text_name=["text", "src_text"],
        )
    assert check_return_type(retval)
    return retval
def _get_other_cols(row: StructExpression) -> List[Column]:
    """Collect column expressions for the optional fields of ``row``.

    A column is added only when the field exists in ``row`` and has the
    expected dtype: ``cm_position`` (renamed to ``position``), ``qual``,
    ``filters``, and one ``INFO_<name>`` column per field of ``info``.

    Args:
        row: Row struct whose fields are inspected.

    Returns:
        The list of column expressions, possibly empty.
    """
    assert check_argument_types()
    columns = []

    if 'cm_position' in row and row.cm_position.dtype == tfloat64:
        columns.append(fx.col("cm_position").alias("position"))

    if 'qual' in row and row.qual.dtype == tfloat64:
        # -10 qual means missing
        qual_col = fx.expr("if(qual = -10, null, qual)").alias("qual")
        columns.append(qual_col)

    if 'filters' in row and row.filters.dtype == tset(tstr):
        # [] filters means PASS, null filters means missing
        filters_col = fx.expr("if(size(filters) = 0, array('PASS'), filters)").alias("filters")
        columns.append(filters_col)

    if 'info' in row and isinstance(row.info.dtype, tstruct):
        # Rename info.* columns to INFO_*
        columns.extend(fx.col(f"`info.{f}`").alias(f"INFO_{f}") for f in row.info)

    assert check_return_type(columns)
    return columns
def aggregate(values: Sequence["ReportedValue"]) -> Num:
    """Reduce a sequence of reported values to a single number.

    ``Average`` values are combined with ``np.nanmean``.
    ``WeightedAverage`` values are combined as a weighted mean after
    dropping entries whose value or weight is not finite. ``np.nan`` is
    returned (with a warning) when nothing usable is left.

    Args:
        values: Reported values; all must be instances of the type of the
            first element.

    Returns:
        The aggregated number, or ``np.nan``.

    Raises:
        ValueError: If the values are of different reported types.
        NotImplementedError: If the reported type is not supported.
    """
    assert check_argument_types()

    for v in values:
        if not isinstance(v, type(values[0])):
            raise ValueError(
                f"Can't use different Reported type together: "
                f"{type(v)} != {type(values[0])}"
            )

    if len(values) == 0:
        warnings.warn("No stats found")
        retval = np.nan

    elif isinstance(values[0], Average):
        retval = np.nanmean([v.value for v in values])

    elif isinstance(values[0], WeightedAverage):
        # Excludes non finite values
        values = [
            v for v in values if np.isfinite(v.value) and np.isfinite(v.weight)
        ]

        if len(values) == 0:
            warnings.warn("No valid stats found")
            retval = np.nan
        else:
            # Calc weighed average. Weights are changed to sum-to-1.
            sum_weights = sum(v.weight for v in values)
            sum_value = sum(v.value * v.weight for v in values)
            if sum_weights == 0:
                warnings.warn("weight is zero")
                retval = np.nan
            else:
                retval = sum_value / sum_weights

    else:
        raise NotImplementedError(f"type={type(values[0])}")

    assert check_return_type(retval)
    return retval
def build_preprocess_fn(
    cls, args: argparse.Namespace, train: bool
) -> Optional[Callable[[str, Dict[str, np.ndarray]], Dict[str, np.ndarray]]]:
    """Build pre-processing function.

    Augmentation-related options (``rir_scp``, ``rir_apply_prob``,
    ``noise_scp``, ``noise_apply_prob``, ``noise_db_range``,
    ``speech_volume_normalize``) are optional attributes of ``args``; each
    one falls back to its own default when it is missing.

    Args:
        cls: ASRTransducerTask object.
        args: Task arguments.
        train: Training mode.

    Return:
        : Callable pre-processing function, or ``None`` when
          ``args.use_preprocessor`` is falsy.

    """
    assert check_argument_types()

    if args.use_preprocessor:
        retval = CommonPreprocessor(
            train=train,
            token_type=args.token_type,
            token_list=args.token_list,
            bpemodel=args.bpemodel,
            non_linguistic_symbols=args.non_linguistic_symbols,
            text_cleaner=args.cleaner,
            g2p_type=args.g2p,
            rir_scp=getattr(args, "rir_scp", None),
            rir_apply_prob=getattr(args, "rir_apply_prob", 1.0),
            noise_scp=getattr(args, "noise_scp", None),
            noise_apply_prob=getattr(args, "noise_apply_prob", 1.0),
            noise_db_range=getattr(args, "noise_db_range", "13_15"),
            # Each option is looked up under its own name. The previous code
            # tested for "rir_scp" here, which dropped the setting (or raised
            # AttributeError) depending on an unrelated attribute.
            speech_volume_normalize=getattr(args, "speech_volume_normalize", None),
        )
    else:
        retval = None

    assert check_return_type(retval)
    return retval
def __call__(
    self, uid: str, data: Dict[str, Union[str, np.ndarray]]
) -> Dict[str, np.ndarray]:
    """Pre-process one sample in place.

    The text stored under ``self.text_name`` is tokenized and replaced by
    an ``np.int64`` array of token ids, provided the key is present and a
    tokenizer is configured. Speech is currently left untouched.

    Args:
        uid: Sample identifier (not referenced in the body).
        data: Sample dictionary; it is modified in place.

    Returns:
        The same ``data`` object.
    """
    assert check_argument_types()

    if self.speech_name in data:
        # Nothing now: candidates:
        # - STFT
        # - Fbank
        # - CMVN
        # - Data augmentation
        pass

    text_key_present = self.text_name in data
    if text_key_present and self.tokenizer is not None:
        raw_text = data[self.text_name]
        token_ids = self.token_id_converter.tokens2ids(
            self.tokenizer.text2tokens(raw_text)
        )
        data[self.text_name] = np.array(token_ids, dtype=np.int64)

    assert check_return_type(data)
    return data
def build_model(cls, args: argparse.Namespace) -> ModNet:
    """Build a ``ModNet`` from frontend, encoder and projector.

    When ``args.input_size`` is given, features are expected from the data
    loader: no frontend is built and ``args.frontend`` /
    ``args.frontend_conf`` are reset on ``args``. The projector maps the
    encoder output size back to the input size.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments (mutated as described above).

    Returns:
        The model, initialized with ``initialize`` when ``args.init`` is set.
    """
    assert check_argument_types()

    # 1. frontend
    if args.input_size is not None:
        # Give features from data-loader
        args.frontend = None
        args.frontend_conf = {}
        frontend = None
        input_size = args.input_size
    else:
        # Extract features in the model
        frontend = frontend_choices.get_class(args.frontend)(**args.frontend_conf)
        input_size = frontend.output_size()

    # 4. Encoder
    encoder = encoder_choices.get_class(args.encoder)(
        input_size=input_size, **args.encoder_conf
    )

    # Projection
    projector_class = projector_choices.get_class(args.projector)
    projector = projector_class(
        input_size=encoder.output_size(), output_size=input_size
    )

    # 8. Build model
    model = ModNet(
        frontend=frontend,
        encoder=encoder,
        projector=projector,
        **args.model_conf,
    )

    # FIXME(kamo): Should be done in model?
    # 9. Initialize
    if args.init is not None:
        initialize(model, args.init)

    assert check_return_type(model)
    return model
def hard_calls(probabilities: Union[Column, str], numAlts: Union[Column, str], phased: Union[Column, str], threshold: float = None) -> Column:
    """
    Turns an array of probabilities into hard calls. The probabilities are assumed to be diploid. See :ref:`variant-data-transformations` for more details.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.createDataFrame([Row(probs=[0.95, 0.05, 0.0])])
        >>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False)).alias('calls')).collect()
        [Row(calls=[0, 0])]
        >>> df = spark.createDataFrame([Row(probs=[0.05, 0.95, 0.0])])
        >>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False)).alias('calls')).collect()
        [Row(calls=[0, 1])]
        >>> # Use the threshold parameter to change the minimum probability required for a call
        >>> df = spark.createDataFrame([Row(probs=[0.05, 0.95, 0.0])])
        >>> df.select(glow.hard_calls('probs', numAlts=lit(1), phased=lit(False), threshold=0.99).alias('calls')).collect()
        [Row(calls=[-1, -1])]

    Args:
        probabilities : The array of probabilities to convert
        numAlts : The number of alternate alleles
        phased : Whether the probabilities are phased. If phased, we expect one ``2 * numAlts`` values in the probabilities array. If unphased, we expect one probability per possible genotype.
        threshold : The minimum probability to make a call. If no probability falls into the range of ``[0, 1 - threshold]`` or ``[threshold, 1]``, a no-call (represented by ``-1`` s) will be emitted. If not provided, this parameter defaults to ``0.9``.

    Returns:
        An array of hard calls
    """
    assert check_argument_types()
    jvm_function = sc()._jvm.io.projectglow.functions.hard_calls
    jvm_args = [
        _to_java_column(probabilities),
        _to_java_column(numAlts),
        _to_java_column(phased),
    ]
    # The threshold is only forwarded when the caller supplied one
    if threshold is not None:
        jvm_args.append(threshold)
    output = Column(jvm_function(*jvm_args))
    assert check_return_type(output)
    return output
def build_preprocess_fn(
    cls, args: argparse.Namespace, train: bool
) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
    """Build the pre-processing callable for two reference texts.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments.
        train: Forwarded to the preprocessor as ``train``.

    Returns:
        A ``CommonPreprocessor_multi`` handling ``text_ref1`` and
        ``text_ref2`` when ``args.use_preprocessor`` is truthy, otherwise
        ``None``.
    """
    assert check_argument_types()
    # TODO(Jing): ask Kamo if it ok to support several args,
    # like text_name = 'text_ref1' and 'text_ref2'
    retval = None
    if args.use_preprocessor:
        reference_text_names = ["text_ref1", "text_ref2"]
        retval = CommonPreprocessor_multi(
            train=train,
            token_type=args.token_type,
            token_list=args.token_list,
            bpemodel=args.bpemodel,
            non_linguistic_symbols=args.non_linguistic_symbols,
            text_name=reference_text_names,
            text_cleaner=args.cleaner,
            g2p_type=args.g2p,
        )
    assert check_return_type(retval)
    return retval
def build_model(cls, args: argparse.Namespace) -> ESPnetEnhancementModel:
    """Build an ``ESPnetEnhancementModel`` from encoder, separator and decoder.

    The separator receives ``encoder.output_dim`` as its first positional
    argument.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments.

    Returns:
        The model, initialized with ``initialize`` when ``args.init`` is set.
    """
    assert check_argument_types()

    encoder_class = encoder_choices.get_class(args.encoder)
    encoder = encoder_class(**args.encoder_conf)

    separator_class = separator_choices.get_class(args.separator)
    separator = separator_class(encoder.output_dim, **args.separator_conf)

    decoder_class = decoder_choices.get_class(args.decoder)
    decoder = decoder_class(**args.decoder_conf)

    # 1. Build model
    model = ESPnetEnhancementModel(
        encoder=encoder,
        separator=separator,
        decoder=decoder,
        **args.model_conf,
    )

    # FIXME(kamo): Should be done in model?
    # 2. Initialize
    if args.init is not None:
        initialize(model, args.init)

    assert check_return_type(model)
    return model
def _speech_process(
    self, data: Dict[str, Union[str, np.ndarray]]
) -> Dict[str, Union[str, np.ndarray]]:
    """Apply augmentation and volume normalization to the speech entry.

    Nothing happens unless ``self.speech_name`` is a key of ``data``.
    In training mode, and when RIRs or noises are configured, the signal is
    optionally convolved with an RIR and mixed with noise (each applied with
    its own probability), then rescaled if its peak exceeds 1.0.
    Independently, when ``self.speech_volume_normalize`` is set, the signal
    is scaled so that its peak equals that value.

    Args:
        data: Sample dictionary; it is modified in place.

    Returns:
        The same ``data`` object.
    """
    assert check_argument_types()
    if self.speech_name in data:
        if self.train and (self.rirs is not None or self.noises is not None):
            speech = data[self.speech_name]

            # speech: (Nmic, Time)
            if speech.ndim == 1:
                speech = speech[None, :]
            else:
                speech = speech.T
            # Calc power on non silence region
            power = (speech[detect_non_silence(speech)] ** 2).mean()

            # 1. Convolve RIR
            # (the random draw only happens when RIRs are configured)
            if self.rirs is not None and self.rir_apply_prob >= np.random.random():
                speech, _ = self._convolve_rir(speech, power)

            # 2. Add Noise
            if self.noises is not None and self.noise_apply_prob >= np.random.random():
                speech, _ = self._add_noise(speech, power)

            # Undo the transpose applied above
            speech = speech.T
            ma = np.max(np.abs(speech))
            if ma > 1.0:
                speech /= ma
            data[self.speech_name] = speech

        if self.speech_volume_normalize is not None:
            speech = data[self.speech_name]
            ma = np.max(np.abs(speech))
            # An all-zero (silent) signal has a zero peak; dividing by it
            # would turn every sample into NaN, so such input is left as is.
            if ma > 0:
                data[self.speech_name] = speech * self.speech_volume_normalize / ma
    assert check_return_type(data)
    return data
def build_model(cls, args: argparse.Namespace) -> ESPnetEnhS2TModel:
    """Build an ``ESPnetEnhS2TModel`` out of per-subtask models.

    For every subtask in ``args.subtask_series`` a configuration namespace
    is assembled and passed to that subtask's ``build_model`` (looked up in
    ``name2task``). Each attribute listed in the module-level
    ``<subtask>_attributes`` is taken from ``args.<subtask>_<attr>`` when
    that is set, otherwise from ``args.<attr>``.

    Args:
        cls: Not referenced in the body.
        args: Parsed task arguments.

    Returns:
        The joint model, initialized with ``initialize`` when ``args.init``
        is set.

    Raises:
        ValueError: If a subtask is not one of ``asr``, ``st``, ``diar`` or
            ``enh``.
    """
    assert check_argument_types()

    # Build submodels in the order of subtask_series
    model_conf = args.model_conf.copy()
    for subtask in args.subtask_series:
        # Validate the name before it is used to build attribute/variable
        # names, so an unsupported subtask fails with a clear error.
        if subtask in ("asr", "st", "diar"):
            m_subtask = "s2t"
        elif subtask == "enh":
            m_subtask = subtask
        else:
            raise ValueError(f"{subtask} not supported.")

        # Plain attribute / module-global lookups replace the former eval()
        # calls: same result for valid names, but no code is evaluated from
        # configuration-provided strings.
        subtask_conf = dict(
            init=None, model_conf=getattr(args, f"{subtask}_model_conf")
        )
        for attr in globals()[f"{subtask}_attributes"]:
            specific = getattr(args, f"{subtask}_{attr}", None)
            subtask_conf[attr] = (
                specific if specific is not None else getattr(args, attr, None)
            )

        logging.info(f"Building {subtask} task model, using config: {subtask_conf}")

        model_conf[f"{m_subtask}_model"] = name2task[subtask].build_model(
            argparse.Namespace(**subtask_conf)
        )

    # 8. Build model
    model = ESPnetEnhS2TModel(**model_conf)

    # FIXME(kamo): Should be done in model?
    # 9. Initialize
    if args.init is not None:
        initialize(model, args.init)

    assert check_return_type(model)
    return model