def to_eaf(self, skipempty=True, pointlength=0.1):
    """Convert the object to a :class:`pympi.Elan.Eaf` object.

    Every tier returned by ``self.get_tiers()`` becomes a tier of the same
    name in the result. Entries of tiers whose ``tier_type`` is
    ``'TextTier'`` are points, so each one is stretched into an interval
    of ``pointlength`` seconds starting at the point.

    :param bool skipempty: Skip annotations whose text is empty or
                           whitespace only.
    :param float pointlength: Length, in seconds, of the interval created
                              for each point.
    :returns: :class:`pympi.Elan.Eaf` object
    :raises ImportError: If the Eaf module can't be loaded.
    :raises ValueError: If the pointlength is not strictly positive.
    """
    # Validate before importing or allocating anything, so that invalid
    # input is always reported as the documented ValueError.
    if pointlength <= 0:
        raise ValueError('Pointlength should be strictly positive')

    # Imported lazily so the dependency is only needed when converting.
    from pympi.Elan import Eaf

    eaf_out = Eaf()
    for tier in self.get_tiers():
        eaf_out.add_tier(tier.name)
        is_point_tier = tier.tier_type == 'TextTier'
        for ann in tier.get_intervals(True):
            if is_point_tier:
                # Point entries are indexed as (time, text); widen them to
                # (begin, end, text) so both tier kinds share the code below.
                ann = (ann[0], ann[0] + pointlength, ann[1])
            if ann[2].strip() or not skipempty:
                # Times are scaled by 1000 and rounded to whole numbers
                # (seconds to milliseconds, given pointlength is in seconds).
                eaf_out.add_annotation(tier.name,
                                       int(round(ann[0] * 1000)),
                                       int(round(ann[1] * 1000)),
                                       ann[2])
    return eaf_out
def to_eaf(self, skipempty=True, pointlength=0.1):
    """Convert the object to a :class:`pympi.Elan.Eaf` object.

    One output tier is created per tier from ``self.get_tiers()``, under
    the same name. Tiers whose ``tier_type`` is ``'TextTier'`` hold
    points; each point is turned into an interval that starts at the
    point and lasts ``pointlength`` seconds.

    :param bool skipempty: Skip annotations whose text is empty or
                           whitespace only.
    :param float pointlength: Length, in seconds, of the interval created
                              for each point.
    :returns: :class:`pympi.Elan.Eaf` object
    :raises ImportError: If the Eaf module can't be loaded.
    :raises ValueError: If the pointlength is not strictly positive.
    """
    # Reject bad input up front, before the import and before any Eaf
    # object is built.
    if pointlength <= 0:
        raise ValueError('Pointlength should be strictly positive')

    # Function-scope import: only required when this conversion is used.
    from pympi.Elan import Eaf

    eaf_out = Eaf()
    for tier in self.get_tiers():
        eaf_out.add_tier(tier.name)
        is_point_tier = tier.tier_type == 'TextTier'
        for ann in tier.get_intervals(True):
            if is_point_tier:
                # Point entries are indexed as (time, text).
                begin, end, text = ann[0], ann[0] + pointlength, ann[1]
            else:
                begin, end, text = ann[0], ann[1], ann[2]
            if not text.strip() and skipempty:
                continue
            # Times are scaled by 1000 and rounded to whole numbers
            # (seconds to milliseconds, given pointlength is in seconds).
            eaf_out.add_annotation(tier.name,
                                   int(round(begin * 1000)),
                                   int(round(end * 1000)),
                                   text)
    return eaf_out
# Build the segment -> tier lookup: on every line the first
# whitespace-separated field is the key and the second is the value.
seg2tier = {}
with open(seg2tier_path) as mapping_file:
    for mapping_line in mapping_file:
        fields = mapping_line.strip().split()
        seg2tier[fields[0]] = fields[1]

# Word level: create one '<tier>_words' tier per tier returned by read_ctm,
# reusing the PARTICIPANT attribute of the tier it was derived from.
# NOTE(review): parent is the literal string 'tier', not the loop variable;
# confirm this is intended.
tiers = read_ctm(words_ctm_path, seg2tier, eaf)
for tier in tiers.keys():
    eaf.add_tier(
        '{}_words'.format(tier),
        parent='tier',
        part=eaf.tiers[tier][2]['PARTICIPANT'],
        ann='Clarin-PL-service',
    )
for tier, segs in tiers.items():
    words_tier = '{}_words'.format(tier)
    for seg in segs:
        eaf.add_annotation(words_tier, seg[0], seg[1], seg[2])

# Phone level: same procedure, only when a phones CTM was supplied.
if args.phones_ctm:
    phones_ctm_path = args.phones_ctm
    tiers = read_ctm(phones_ctm_path, seg2tier, eaf)
    for tier in tiers.keys():
        eaf.add_tier(
            '{}_phones'.format(tier),
            parent='tier',
            part=eaf.tiers[tier][2]['PARTICIPANT'],
            ann='Clarin-PL-service',
        )
    for tier, segs in tiers.items():
        phones_tier = '{}_phones'.format(tier)
        for seg in segs:
            eaf.add_annotation(phones_tier, seg[0], seg[1], seg[2])

# Write the augmented EAF to the output path.
to_eaf(str(eaf_out_path), eaf)
def make_elans(input_dir: str, output_dir: str, copy_wavs: bool,
               sample_rate: int = 16000) -> None:
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file

    Every ``*.txt`` file found recursively under ``input_dir`` is read as
    ``<start> <end> <text...>``, where the first two fields are integer
    audio sample offsets. Both offsets are converted to milliseconds and
    used as the begin and end of a single annotation on a tier named
    ``default``. The name of the directory holding the file supplies the
    participant: its first character is dropped (per the layout notes
    below it is the speaker's sex) and the remainder is the speaker id.

    Expected layout, e.g. ``input/dr1/fmem0/sa2.txt``:
        SEX        :== m | f
        SPEAKER_ID :== <INITIALS><DIGIT>
        INITIALS   :== speaker initials, 3 letters
        DIGIT      :== number 0-9 to differentiate speakers with identical initials

    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param output_dir: Directory name to save EAF files into
    :param copy_wavs: Meant to control whether the WAV file is copied to the
        output dir. NOTE(review): the check is commented out, so this value
        is currently not consulted and the WAV is always copied - confirm
        the intended behaviour before enabling it.
    :param sample_rate: Audio sample rate in Hz used to convert sample
        offsets to milliseconds
    :raises ValueError: If ``sample_rate`` is not strictly positive, or if
        an offset field is not an integer
    """
    if sample_rate <= 0:
        raise ValueError('sample_rate should be strictly positive')

    # Process each file
    files = glob.glob(f'{input_dir}/**/*.txt', recursive=True)
    print(files)

    for filename in files:
        # e.g. filename  input/dr1/fmem0/sa2.txt
        #      filepath  input/dr1/fmem0/sa2
        #      subdir    fmem0
        #      basename  sa2
        filepath = os.path.splitext(filename)[0]
        basename = os.path.splitext(os.path.basename(filepath))[0]
        subdirname = os.path.basename(os.path.dirname(filepath))
        participant = subdirname[1:]

        # Get annotation from the text file matching on file basename
        with open(filename, 'r', encoding='utf-8') as text_file:
            annotation_split = text_file.read().split()

        # Both offsets are required; skip a malformed file instead of
        # aborting the whole batch with a bare IndexError.
        if len(annotation_split) < 2:
            print(f'Skipping {filename}: expected "<start> <end> <text>"')
            continue

        # Convert audio samples to seconds to ms. The start offset comes
        # from the same file and uses the same unit as the end offset, so
        # it gets the same conversion.
        start = int(int(annotation_split[0]) / sample_rate * 1000)
        end = int(int(annotation_split[1]) / sample_rate * 1000)
        annotation_text = " ".join(annotation_split[2:])
        # Add any annotation cleaning here
        print(start, end, annotation_text)

        # Output EAF and WAV share one stem: <output_dir>/<subdir>-<basename>
        output_stem = os.path.join(output_dir, f'{subdirname}-{basename}')
        output_wav = f'{output_stem}.wav'

        # Make EAF file
        output_eaf = Eaf()
        output_eaf.add_tier('default', part=participant)
        output_eaf.add_annotation('default', start, end, annotation_text)
        output_eaf.add_linked_file(output_wav)
        output_eaf.to_file(f'{output_stem}.eaf')

        # The EAF links to the WAV inside output_dir, so the audio is
        # copied next to it (unconditionally - see the copy_wavs note).
        shutil.copyfile(f'{filepath}.wav', output_wav)

    print('>>> Done')