def run(self, input_file, opt_input_file=None, output_file=None):
    """Syllabify the time-aligned phonemes of a file.

    Depending on the options, the result holds the syllables of the whole
    phonemes tier ('usesphons') and/or the syllables searched inside the
    intervals of the tier named by the 'tiername' option
    ('usesintervals'). Each of them is followed by its tier of classes
    if 'createclasses' is enabled.

    :param input_file: (list of str) time-aligned phonemes. Only the
    first file name of the list is read.
    :param opt_input_file: (list of str) ignored
    :param output_file: (str) the output file name. Nothing is saved
    if None.
    :returns: (sppasTranscription)
    :raises: EmptyOutputError if an output file is requested but the
    result contains no tier

    """
    options = self._options
    phones_filename = input_file[0]

    # Load the input and pick up its tier of time-aligned phonemes
    reader = sppasRW(phones_filename)
    trs_input = reader.read()
    tier_input = sppasFindTier.aligned_phones(trs_input)

    # Prepare the container of the resulting tiers
    trs_output = sppasTranscription(self.name)
    trs_output.set_meta('syllabification_result_of', phones_filename)

    # Syllables of the whole phonemes tier
    if options['usesphons'] is True:
        syllables = self.convert(tier_input)
        trs_output.append(syllables)
        if options['createclasses']:
            trs_output.append(self.make_classes(syllables))

    # Syllables searched inside the intervals of a reference tier
    if options['usesintervals'] is True:
        intervals = trs_input.find(options['tiername'])
        if intervals is not None:
            syllables_int = self.convert(tier_input, intervals)
            syllables_int.set_name("SyllAlign-Intervals")
            syllables_int.set_meta('syllabification_used_intervals',
                                   intervals.get_name())
            trs_output.append(syllables_int)
            if options['createclasses']:
                classes_int = self.make_classes(syllables_int)
                classes_int.set_name("SyllClassAlign-Intervals")
                trs_output.append(classes_int)
        else:
            # The reference tier is missing: warn and go on
            self.logfile.print_message(
                (info(1264, "annotations")).format(
                    tiername=options['tiername']),
                indent=2, status=annots.warning)

    # Nothing to save: the result is only returned
    if output_file is None:
        return trs_output

    # Refuse to write a file without any tier
    if len(trs_output) > 0:
        writer = sppasRW(output_file)
        writer.write(trs_output)
    else:
        raise EmptyOutputError

    return trs_output
def print_options(self):
    """Write each option and its value in the user log.

    A title line is printed first, then one line per option, and finally
    an empty line.

    """
    title = info(1050, "annotations") + ": "
    self.logfile.print_message(title, indent=0, status=None)

    for opt_name, opt_value in self._options.items():
        self.logfile.print_message(
            " ... {!s:s}: {!s:s}".format(opt_name, opt_value),
            indent=0,
            status=None)

    self.logfile.print_newline()
def check_file(filename):
    """Diagnose a file, either an audio or an annotated one.

    The checking function is selected from the extension of the file
    name, compared in lower case with the known audio extensions first,
    then with the known annotated-file extensions.

    :param filename: (str) name of the input file to diagnose.
    :returns: tuple with (status identifier, message). The status is an
    error if the extension matches none of the known ones.

    """
    extension = os.path.splitext(filename)[1]

    if extension.lower() in sppas.src.audiodata.aio.extensions:
        return sppasDiagnosis.check_audio_file(filename)

    if extension.lower() in sppas.src.anndata.aio.extensions:
        return sppasDiagnosis.check_trs_file(filename)

    # Unknown extension: the message mentions it as it was given
    header = info(1006, "annotations")
    detail = (info(1020, "annotations")).format(extension=extension)
    return annots.error, header + detail
def print_stat_item(self, step_number, value=None):
    """Print an item made of the name of a step and a value.

    Do not print anything if no parameters were given.

    :param step_number: (1..N)
    :param value: (str) A statistic value. If None, the message matching
    the status of the step is printed instead.

    """
    params = self.parameters
    if params is None:
        return

    if value is None:
        # No value given: fall back on the enabled/disabled message
        enabled = params.get_step_status(step_number)
        msg_id = 1030 if enabled else 1031
        value = info(msg_id, "annotations")

    step_name = params.get_step_name(step_number)
    self.print_item(step_name, str(value))
def print_diagnosis(self, *filenames):
    """Write the diagnosis of each given file in the user report.

    Entries which are None and files which do not exist are silently
    skipped.

    :param filenames: (list) List of files.

    """
    for filename in filenames:
        if filename is None or not os.path.exists(filename):
            continue

        base_name = os.path.basename(filename)
        # Only the message of the diagnosis is reported, not its status
        _, diagnosis = sppasDiagnosis.check_file(filename)
        report = (info(1056, "annotations")).format(base_name) \
            + ": {!s:s}".format(diagnosis)
        self.logfile.print_message(report, indent=0, status=None)
def print_header(self):
    """Print the name, copyright, url and contact of the software.

    The lines are written at the end of the log stream, followed by an
    empty line and a separator. If there is no log stream, they are sent
    to the logging module instead.

    """
    header_lines = (
        sg.__name__ + ' ' + info(1032, "annotations") + ' ' + sg.__version__,
        sg.__copyright__,
        info(1033, "annotations") + ': ' + sg.__url__,
        info(1034, "annotations") + ': ' +
        sg.__author__ + " (" + sg.__contact__ + ")",
    )

    if self.logfp is None:
        for line in header_lines:
            logging.info(line)
        return

    # Move to the end of the stream before writing
    self.logfp.seek(0, 2)
    for line in header_lines:
        self.print_message(line)
    self.print_newline()
    self.print_separator()
def _phonetize(self, entry):
    """Phonetize a text.

    The entry is split on whitespace and submitted to the phonetizer,
    which gives a (token, phonetization, status) for each token.

    - If a token has the error status, a message is logged and the
      result is immediately a list with only the unk stamp.
    - If a token has the warning status, a message is logged; its
      phonetization is replaced by the unk stamp if it is empty.

    :param entry: (str) The string to be phonetized.
    :returns: (list) the phonetization of each token of the entry, or
    a list with the unk stamp

    """
    unk = symbols.unk
    results = self.__phonetizer.get_phon_tokens(
        entry.split(),
        phonunk=self._options['phonunk'])

    phones = list()
    for token, phon, status in results:

        if status == annots.error:
            failure = (info(1110, "annotations")).format(token) + \
                info(1114, "annotations")
            self.logfile.print_message(failure, indent=2, status=status)
            return [unk]

        message = None
        if status == annots.warning:
            message = (info(1110, "annotations")).format(token)
            if len(phon) == 0:
                message = message + info(1114, "annotations")
                phon = unk
            else:
                message = message + (info(1112, "annotations")).format(phon)
        phones.append(phon)

        if message:
            self.logfile.print_message(message, indent=2, status=status)

    return phones
def convert(self, tier):
    """Phonetize annotations of a tokenized tier.

    Each label of an annotation is turned into one label whose tags are
    the pronunciation variants of its texts. Pauses and silences are
    directly mapped to SIL, and empty texts are ignored.

    :param tier: (Tier) the ortho transcription previously tokenized.
    :returns: (Tier) phonetized tier with name "Phones"
    :raises: IOError if no tier is given
    :raises: EmptyInputError if the given tier is empty

    """
    if tier is None:
        raise IOError('No given tier.')
    if tier.is_empty() is True:
        raise EmptyInputError(name=tier.get_name())

    phones_tier = sppasTier("Phones")
    for i, ann in enumerate(tier):
        self.logfile.print_message(
            (info(1220, "annotations")).format(number=i + 1), indent=1)

        location = ann.get_location().copy()
        labels = list()

        # Normalize all labels of the orthographic transcription
        for label in ann.get_labels():

            phonetizations = list()
            for text, score in label:
                if text.is_pause() or text.is_silence():
                    # It's in case the pronunciation dictionary
                    # were not properly fixed.
                    phonetizations.append(SIL)

                elif text.is_empty() is False:
                    phones = self._phonetize(text.get_content())
                    for p in phones:
                        phonetizations.extend(p.split(separators.variants))

            # New in SPPAS 1.9.6.
            #  - The result is a sequence of labels.
            #  - Variants are alternative tags.
            # Duplicated variants are removed but the order in which they
            # were produced is kept: iterating over a set() would give an
            # arbitrary order, which can differ from one run to another.
            seen = set()
            tags = list()
            for p in phonetizations:
                if p not in seen:
                    seen.add(p)
                    tags.append(sppasTag(p))
            labels.append(sppasLabel(tags))

        phones_tier.create_annotation(location, labels)

    return phones_tier
def check_trs_file(filename):
    """Check an annotated file.

    Are verified:

        1. the file can be opened for reading with the expected
           encoding, then closed (error). Nothing is read from the file
           by this test;
        2. the filename contains only US-ASCII characters (warning);
        3. the filename contains no whitespace (warning).

    The first failed test gives the result.

    :param filename: (string) name of the input file
    :returns: tuple with (status identifier, message)

    """
    ok_message = info(1000, "annotations")

    # test encoding: open then close, without reading
    try:
        handle = codecs.open(filename, "r", sg.__encoding__)
        handle.close()
    except UnicodeDecodeError:
        reason = (info(1026, "annotations")).format(encoding=sg.__encoding__)
        return annots.error, info(1004, "annotations") + reason
    except Exception as e:
        return annots.error, info(1004, "annotations") + str(e)

    # test US_ASCII in filename
    if any(ord(char) >= 128 for char in filename):
        return (annots.warning,
                info(1002, "annotations") + info(1022, "annotations"))

    # test whitespace in filename
    if " " in filename:
        return (annots.warning,
                info(1002, "annotations") + info(1024, "annotations"))

    return annots.ok, ok_message
def convert(self, phonemes, intervals=None):
    """Syllabify labels of a time-aligned phones tier.

    The phonemes are syllabified inside each interval. A warning is
    logged for each interval for which the index of the first or of the
    last phoneme was not found.

    :param phonemes: (sppasTier) time-aligned phonemes tier
    :param intervals: (sppasTier) the intervals to work in. If None,
    they are estimated from the phonemes tier.
    :returns: (sppasTier) tier with name "SyllAlign"

    """
    if intervals is None:
        intervals = sppasSyll._phon_to_intervals(phonemes)

    syllables = sppasTier("SyllAlign")
    syllables.set_meta('syllabification_of_tier', phonemes.get_name())

    for interval in intervals:

        # index of the phoneme at the beginning of the interval:
        # first attempt with lindex, second one with mindex
        first_idx = phonemes.lindex(interval.get_lowest_localization())
        if first_idx == -1:
            first_idx = phonemes.mindex(
                interval.get_lowest_localization(), bound=-1)

        # index of the phoneme at the end of the interval:
        # first attempt with rindex, second one with mindex
        last_idx = phonemes.rindex(interval.get_highest_localization())
        if last_idx == -1:
            last_idx = phonemes.mindex(
                interval.get_highest_localization(), bound=1)

        # at least one of the indexes is missing: warn, go to the next
        if first_idx == -1 or last_idx == -1:
            self.logfile.print_message(
                (info(1224, "annotations")).format(interval),
                indent=2, status=annots.warning)
            continue

        # syllabify within the interval
        self.syllabify_interval(phonemes, first_idx, last_idx, syllables)

    return syllables
def segment(self, audio_filename, phon_name, token_name, align_name):
    """Call an aligner to perform speech segmentation and manage errors.

    The phonetization and the tokenization are read from their files and
    given to both the aligner and the basic aligner. The basic aligner
    is then used if there is nothing to align, or if there is at most
    one whitespace-separated item without any "-" in the phonetization.
    The aligner is used in all the other cases.

    :param audio_filename: (str) the audio file name of an IPU
    :param phon_name: (str) file name with the phonetization
    :param token_name: (str) file name with the tokenization
    :param align_name: (str) file name to save the result WITHOUT ext.
    :returns: A message of the aligner in case of any problem, or
    an empty string if success.

    """
    # Get the phonetization string to time-align
    phones = ""
    if phon_name is not None:
        phones = self._readline(phon_name)
        for aligner in (self._aligner, self._basic_aligner):
            aligner.set_phones(phones)

    # Get the tokenization string to time-align
    tokens = ""
    if token_name is not None:
        tokens = self._readline(token_name)
        for aligner in (self._aligner, self._basic_aligner):
            aligner.set_tokens(tokens)

    # Do not align nothing!
    if len(phones) == 0:
        self._basic_aligner.run_alignment(audio_filename, align_name)
        return info(1222, "annotations")

    # Do not align only one phoneme!
    single_item = len(phones.split()) <= 1
    if single_item and "-" not in phones:
        self._basic_aligner.run_alignment(audio_filename, align_name)
        return ""

    # Execute Alignment
    report = self._aligner.check_data()
    report += self._aligner.run_alignment(audio_filename, align_name)
    return report
def print_annotations_header(self):
    """Print the description of the parameters in the output stream.

    Are printed: two title lines, the current time, the language of each
    step ("---" if it has none), the inputs, the status of each step and
    the output format.

    Do not print anything if no parameters were given.

    """
    params = self.parameters
    if params is None:
        return

    margin = ' '*24

    # Titles
    self.print_message(margin + info(1054, "annotations"))
    self.print_newline()
    self.print_message(margin + info(1035, "annotations"))
    self.print_separator()
    self.print_newline()

    # Date, then the language of each step
    self.print_message(info(1036, "annotations") + ': ' + sppasTime().now)
    self.print_message(info(1037, "annotations") + ': ')
    for step in range(params.get_step_numbers()):
        if params.get_lang(step) is None:
            self.print_item(params.get_step_name(step), "---")
        else:
            self.print_item(params.get_step_name(step),
                            params.get_lang(step))
    self.print_newline()

    # Inputs
    self.print_message(info(1038, "annotations") + ': ')
    for given_input in params.get_sppasinput():
        self.print_item(given_input)
    self.print_newline()

    # Status of each step
    self.print_message(info(1039, "annotations") + ': ')
    for step in range(params.get_step_numbers()):
        self.print_stat_item(step)
    self.print_newline()

    # Output format
    self.print_message(
        info(1040, "annotations") + ': ' + params.get_output_format())
    self.print_newline()
def batch_processing(self, file_names, progress=None, output_format=annots.extension):
    """Perform the annotation on a set of files.

    The given list of inputs can be either:
        - a list of the files to be used as a single input:
          [file1, file2, ...]
        - a list of the files to be used as several-required-inputs:
          [(file1_a, file1_b), (file2_a, file2_b), ...]
        - a list of the files to be used as inputs and optional-inputs:
          [((file_1_a), (file_1_x)), ((file_2_a), (file_2_x)), ... ]
        - a list of the files to be used as several-required-inputs and
          optional-inputs:
          [((file1_a, file1_b), (file_1_x, file_1_y)), ...]

    The options are printed first, if any. Then, for each entry, the
    diagnosis of its files is reported, the annotation is performed and
    its result is logged.

    :param file_names: (list) List of inputs
    :param progress: ProcessProgressTerminal() or ProcessProgressDialog()
    :param output_format: (str)
    :return: (int) Number of files processed with success

    """
    if len(self._options) > 0:
        self.print_options()

    total = len(file_names)
    if total == 0:
        return 0

    if progress:
        progress.set_header(self.name)
        progress.update(0, "")

    # Execute the annotation for each file in the list
    nb_success = 0
    for rank, entry in enumerate(file_names, 1):

        required, optional = self._split_inputs(entry)
        self.print_diagnosis(*required)
        self.print_diagnosis(*optional)

        out_name = self.run_for_batch_processing(
            required, optional, output_format)

        if out_name is not None:
            nb_success += 1
            self.logfile.print_message(out_name, indent=1,
                                       status=annots.ok)
        else:
            self.logfile.print_message(info(1306, "annotations"),
                                       indent=1, status=annots.info)
        self.logfile.print_newline()

        if progress:
            progress.set_fraction(round(float(rank) / float(total), 2))

    # Indicate completed!
    if progress:
        progress.update(
            1, (info(9000, "ui").format(nb_success, total)))
        progress.set_header("")

    return nb_success
from sppas.src.models.acm.modelmixer import sppasModelMixer
from sppas.src.utils.fileutils import sppasFileUtils

from ..baseannot import sppasBaseAnnotation
from ..searchtier import sppasFindTier
from ..annotationsexc import AnnotationOptionError
from ..annotationsexc import EmptyDirectoryError
from ..annotationsexc import NoInputError

from .tracksio import TracksReaderWriter
from .tracksgmt import TrackSegmenter
from .activity import sppasActivity

# ---------------------------------------------------------------------------
# Messages of this module. Each one is fetched only once, when these
# assignments are executed, by a call to info() with a message identifier
# and the "annotations" domain.
# NOTE(review): presumably these are the texts written in the user report;
# some of them may expect a later call to format() - confirm at call sites.

MSG_MODEL_L1_FAILED = (info(1210, "annotations"))
MSG_ALIGN_TRACK = (info(1220, "annotations"))
MSG_ALIGN_FAILED = (info(1230, "annotations"))
MSG_BASIC = (info(1240, "annotations"))
MSG_ACTION_SPLIT_INTERVALS = (info(1250, "annotations"))
MSG_ACTION_ALIGN_INTERVALS = (info(1252, "annotations"))
MSG_ACTION_MERGE_INTERVALS = (info(1254, "annotations"))
MSG_ACTION_EXTRA_TIER = (info(1256, "annotations"))
MSG_TOKENS_DISABLED = (info(1260, "annotations"))
MSG_NO_TOKENS_ALIGN = (info(1262, "annotations"))
MSG_EXTRA_TIER = (info(1270, "annotations"))
MSG_WORKDIR = (info(1280, "annotations"))

# ---------------------------------------------------------------------------
def check_audio_file(filename):
    """Check an audio file.

    Are verified:

        1. the format of the file (error);
        2. the number of channels (error);
        3. the sample width (error or warning);
        4. the framerate (error or warning);
        5. the filename (warning).

    An error is never downgraded: when the audio content is in error,
    the messages about the filename are appended but the status remains
    an error.

    :param filename: (str) name of the input file
    :returns: tuple with (status identifier, message)

    """
    status = annots.ok
    message = ""

    # test file format: can we support it?
    try:
        audio = sppas.src.audiodata.aio.open(filename)
        try:
            fm = audio.get_framerate()
            sp = audio.get_sampwidth()*8
            nc = audio.get_nchannels()
        finally:
            # release the file even if one of the getters failed
            audio.close()
    except UnicodeDecodeError:
        message = info(1004, "annotations") + \
            (info(1026, "annotations")).format(encoding=sg.__encoding__)
        return annots.error, message
    except Exception as e:
        message = info(1004, "annotations") + str(e)
        return annots.error, message

    # values which are not supported
    if nc > sppasDiagnosis.EXPECTED_CHANNELS:
        status = annots.error
        message += (info(1010, "annotations")).format(number=nc)

    if sp < sppasDiagnosis.EXPECTED_SAMPLE_WIDTH*8:
        status = annots.error
        message += (info(1012, "annotations")).format(sampwidth=sp)

    if fm < sppasDiagnosis.EXPECTED_FRAME_RATE:
        status = annots.error
        message += (info(1014, "annotations")).format(framerate=fm)

    # values which are supported but higher than the expected ones
    if status != annots.error:
        if sp > sppasDiagnosis.EXPECTED_SAMPLE_WIDTH*8:
            status = annots.warning
            message += (info(1016, "annotations")).format(sampwidth=sp)

        if fm > sppasDiagnosis.EXPECTED_FRAME_RATE:
            status = annots.warning
            message += (info(1018, "annotations")).format(framerate=fm)

    # test US-ASCII chars in filename
    if all(ord(x) < 128 for x in filename) is False:
        # a problem with the filename must not hide a previous error
        if status != annots.error:
            status = annots.warning
        message += info(1022, "annotations")

    # test whitespace in filename
    if " " in filename:
        if status != annots.error:
            status = annots.warning
        message += info(1024, "annotations")

    # add the header matching the final status
    if status == annots.error:
        message = info(1004, "annotations") + message
    elif status == annots.warning:
        message = info(1002, "annotations") + message
    else:
        message = info(1000, "annotations")

    return status, message