Example #1
    def read_sents(self, filename, filter_ids=None):
        """Read parallel character/segmentation files into annotated sentences."""
        if self.vocab is None:
            self.vocab = Vocab()

        def convert(line, segmentation):
            line = line.strip().split()
            ret = AnnotatedSentenceInput(
                list(map(self.vocab.convert, line)) +
                [self.vocab.convert(Vocab.ES_STR)])
            ret.annotate("segment", list(map(int,
                                             segmentation.strip().split())))
            return ret

        if not isinstance(filename, list):
            try:
                # The filename may be a string literal of a list, e.g. "['a.txt', 'b.txt']"
                filename = ast.literal_eval(filename)
            except (ValueError, SyntaxError):
                logger.debug("Reading %s with a PlainTextReader instead..." %
                             filename)
                return super(SegmentationTextReader, self).read_sents(filename)

        max_id = None
        if filter_ids is not None:
            max_id = max(filter_ids)
            filter_ids = set(filter_ids)
        data = []
        with open(filename[0], encoding='utf-8') as char_inp,\
             open(filename[1], encoding='utf-8') as seg_inp:
            for sent_count, (char_line,
                             seg_line) in enumerate(zip(char_inp, seg_inp)):
                if filter_ids is None or sent_count in filter_ids:
                    data.append(convert(char_line, seg_line))
                if max_id is not None and sent_count > max_id:
                    break
        return data
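A minimal usage sketch for this reader: read_sents expects a list of two parallel files, characters in the first and integer segmentation annotations in the second. The import path, reader construction, and file names below are assumptions for illustration, not taken from the snippet.

# Hypothetical usage sketch; import path, constructor, and file names are assumptions.
from xnmt.input_readers import SegmentationTextReader  # module path may differ

reader = SegmentationTextReader()
# Reads "train.chars" and "train.segs" line by line in parallel and returns
# AnnotatedSentenceInput objects carrying a "segment" annotation.
sents = reader.read_sents(["train.chars", "train.segs"])
print(f"loaded {len(sents)} annotated sentences")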
Example #2
 def _augment_data_initial(self):
   """
    Called before loading the corpus for the first time, if reload_command is given.
   """
   augment_command = self.reload_command
   logger.debug('initial augmentation')
   if self._augmentation_handle is None:
     # first run
     self._augmentation_handle = Popen(augment_command + " --epoch 0", shell=True)
     self._augmentation_handle.wait()
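For orientation, a sketch of the same blocking subprocess pattern in isolation; the command string is a made-up placeholder and not part of the snippet above.

# Hypothetical sketch of the blocking reload/augmentation call used above.
from subprocess import Popen

reload_command = "python augment_corpus.py"  # placeholder command, not from the source
handle = Popen(reload_command + " --epoch 0", shell=True)  # launch augmentation
handle.wait()  # block until the augmented corpus exists before the first load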
Example #3
 def extract_to(self, in_file: str, out_file: str) -> None:
     """
 Args:
   in_file: yaml file that contains a list of dictionaries.
            Each dictionary contains:
            - wav (str): path to wav file
            - offset (float): start time stamp (optional)
            - duration (float): stop time stamp (optional)
            - speaker: speaker id for normalization (optional; if not given, the filename is used as speaker id)
   out_file: a filename ending in ".h5"
 """
     import librosa
     if not out_file.endswith(".h5"):
         raise ValueError(f"out_file must end in '.h5', was '{out_file}'")
     start_time = time.time()
     with open(in_file) as in_stream, \
          h5py.File(out_file, "w") as hf:
         db = yaml.load(in_stream, Loader=yaml.Loader)
         db_by_speaker = defaultdict(list)
         for db_index, db_item in enumerate(db):
             speaker_id = db_item.get("speaker",
                                      db_item["wav"].split("/")[-1])
             db_item["index"] = db_index
             db_by_speaker[speaker_id].append(db_item)
         for speaker_id in db_by_speaker.keys():
             data = []
             for db_item in db_by_speaker[speaker_id]:
                 y, sr = librosa.load(db_item["wav"],
                                      sr=16000,
                                      offset=db_item.get("offset", 0.0),
                                      duration=db_item.get(
                                          "duration", None))
                  # Audio shorter than sr/40 samples (25 ms, one filterbank
                  # window) is padded with zeros so feature extraction succeeds.
                  if len(y) * 40 < sr:
                      logger.warning(
                          f"Encountered a short audio with only {len(y)} values. Padding with zeros to extract filterbank features..."
                      )
                      missing_len = sr - len(y) * 40
                      y = np.pad(y, (0, missing_len), mode='constant')
                      # raise ValueError(f"encountered an empty or out of bounds segment: {db_item}")
                 logmel = speech_features.logfbank(y,
                                                   samplerate=sr,
                                                   nfilt=self.nfilt)
                 if self.delta:
                     delta = speech_features.calculate_delta(logmel)
                     features = np.concatenate([logmel, delta], axis=1)
                 else:
                     features = logmel
                 data.append(features)
             mean, std = speech_features.get_mean_std(np.concatenate(data))
             for features, db_item in zip(data, db_by_speaker[speaker_id]):
                 features = speech_features.normalize(features, mean, std)
                 hf.create_dataset(str(db_item["index"]), data=features)
     logger.debug(
         f"feature extraction took {time.time()-start_time:.3f} seconds")
Example #4
    def tokenize_stream(self, stream):
        """
    Tokenize a file-like text stream.

    Args:
      stream: A file-like stream of untokenized text
    Returns:
      A file-like stream of tokenized text

    """
        logger.debug("****** calling tokenize_stream {}".format(
            self.__class__))
        for line in stream:
            yield self.tokenize(line.strip())
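# --- Hypothetical usage sketch (not part of the original snippet) -----------
# `tokenizer` stands for any concrete tokenizer exposing tokenize(); the
# corpus path is illustrative:
#
#   with open("corpus.txt", encoding="utf-8") as f:
#       for tok_line in tokenizer.tokenize_stream(f):
#           print(tok_line)
#
# Each input line is stripped and passed through self.tokenize, so one
# tokenized line is yielded per input line.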
def main(overwrite_args=None):

    with tee.Tee(), tee.Tee(error=True):
        argparser = argparse.ArgumentParser()
        argparser.add_argument("--dynet-mem", type=str)
        argparser.add_argument("--dynet-seed", type=int)
        argparser.add_argument("--dynet-autobatch", type=int)
        argparser.add_argument("--dynet-devices", type=str)
        argparser.add_argument("--dynet-viz",
                               action='store_true',
                               help="use visualization")
        argparser.add_argument("--dynet-gpu",
                               action='store_true',
                               help="use GPU acceleration")
        argparser.add_argument("--dynet-gpu-ids", type=int)
        argparser.add_argument("--dynet-gpus", type=int)
        argparser.add_argument("--dynet-weight-decay", type=float)
        argparser.add_argument("--dynet-profiling", type=int)
        argparser.add_argument("--settings",
                               type=str,
                               default="standard",
                               help="settings (standard, debug, or unittest)"
                               "must be given in '=' syntax, e.g."
                               " --settings=standard")
        argparser.add_argument("experiments_file")
        argparser.add_argument("experiment_name",
                               nargs='*',
                               help="Run only the specified experiments")
        argparser.set_defaults(generate_doc=False)
        args = argparser.parse_args(overwrite_args)

        if args.dynet_seed:
            random.seed(args.dynet_seed)
            np.random.seed(args.dynet_seed)

        if args.dynet_gpu:
            if settings.CHECK_VALIDITY:
                settings.CHECK_VALIDITY = False
                logger.warning(
                    "disabling CHECK_VALIDITY because it is not supported on GPU currently"
                )

        config_experiment_names = YamlPreloader.experiment_names_from_file(
            args.experiments_file)

        results = []

        # Check ahead of time that all experiments exist, to avoid bad surprises
        experiment_names = args.experiment_name or config_experiment_names

        if args.experiment_name:
            nonexistent = set(experiment_names).difference(
                config_experiment_names)
            if len(nonexistent) != 0:
                raise Exception("Experiments {} do not exist".format(",".join(
                    list(nonexistent))))

        for experiment_name in experiment_names:

            ParamManager.init_param_col()

            uninitialized_exp_args = YamlPreloader.preload_experiment_from_file(
                args.experiments_file, experiment_name)

            logger.info(f"=> Running {experiment_name}")
            logger.debug(
                f"running XNMT revision {tee.get_git_revision()} on {socket.gethostname()} on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            )

            glob_args = uninitialized_exp_args.data.exp_global
            log_file = glob_args.log_file

            if os.path.isfile(log_file) and not settings.OVERWRITE_LOG:
                logger.warning(
                    f"log file {log_file} already exists; please delete by hand if you want to overwrite it (or use --settings debug or otherwise set OVERWRITE_LOG=True); skipping experiment.."
                )
                continue

            tee.set_out_file(log_file)

            model_file = glob_args.model_file

            uninitialized_exp_args.data.exp_global.commandline_args = args

            # Create the model
            experiment = initialize_if_needed(uninitialized_exp_args)
            ParamManager.param_col.model_file = experiment.exp_global.model_file
            ParamManager.param_col.save_num_checkpoints = experiment.exp_global.save_num_checkpoints
            ParamManager.populate()

            # Run the experiment
            eval_scores = experiment(save_fct=lambda: save_to_file(
                model_file, experiment, ParamManager.param_col))
            results.append((experiment_name, eval_scores))
            print_results(results)

            tee.unset_out_file()
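A sketch of driving this entry point programmatically via overwrite_args, which is passed straight to argparse. The module path and the experiments file name are assumptions.

# Hypothetical invocation sketch; module path and file names are assumptions.
from xnmt import xnmt_run_experiments  # module path may differ

# Roughly equivalent to the command line:
#   python -m xnmt.xnmt_run_experiments --settings=debug my_experiments.yaml exp1
xnmt_run_experiments.main(
    overwrite_args=["--settings=debug", "my_experiments.yaml", "exp1"])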