def __call__(self, current_file: ProtocolFile) -> Annotation:
    """Restrict the file's annotation to its speech regions.

    Speech regions are looked up in ``self.sad_`` under the file's
    ``"uri"``; an empty ``Annotation`` is used when there is no entry.

    Parameters
    ----------
    current_file : ProtocolFile
        File description. Must provide ``"uri"``; ``"annotation"`` is
        optional.

    Returns
    -------
    Annotation
        ``current_file["annotation"]`` cropped to the speech regions,
        or the speech regions themselves when the file has no
        ``"annotation"`` key.
    """
    speech = self.sad_.get(current_file["uri"], Annotation())

    try:
        # drop non-speech regions, i.e. only keep what overlaps speech
        cropped = current_file["annotation"].crop(speech.get_timeline())
    except KeyError:
        # this happens when current_file has no "annotation" key
        # (e.g. for file1 and file2 in speaker verification trials):
        # in that case, the speech regions are returned directly
        return speech

    return cropped
def getAnnotations(self):
    """Build an annotation from the clustering result.

    Every segment of every cluster in ``self.clusters`` is labelled
    with that cluster's ``label``.

    Returns
    -------
    annotation : pyannote.core.Annotation
        Annotation created with ``uri=self.uri`` and
        ``modality='speaker'``. It stays empty when ``self.clusters``
        is empty.

    TODO: add a warning when ``self.clusters`` is empty.
    """
    result = Annotation(uri=self.uri, modality='speaker')
    labelled_segments = (
        (segment, cluster.label)
        for cluster in self.clusters
        for segment in cluster.segments
    )
    for segment, label in labelled_segments:
        result[segment] = label
    return result
def run(self):
    """Convert the subtitle edges of a transcription into an annotation.

    The transcription is read from ``self.in_subtitles()``. Each edge
    whose data contains a ``'subtitle'`` key becomes one segment with
    its own integer label (0, 1, 2, ... in iteration order). Labels are
    then replaced through ``anonymize_labels(generator='string')`` and
    the result is written to ``self.out_put()``.
    """
    with self.in_subtitles().open('r') as fp:
        transcription = pyannote.core.json.load(fp)

    annotation = Annotation()

    # (start, end) of every edge carrying a subtitle, in edge order
    subtitle_spans = (
        (start, end)
        for start, end, edge in transcription.ordered_edges_iter(data=True)
        if 'subtitle' in edge
    )
    for index, (start, end) in enumerate(subtitle_spans):
        annotation[Segment(start, end)] = index

    annotation = annotation.anonymize_labels(generator='string')

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(annotation, fp)
def __call__(self):
    """Label shots by the thread they belong to.

    Threads are the connected components of the graph returned by
    ``self._threads_graph()``. Each thread gets the next label from
    ``getLabelGenerator()`` and every shot of the thread is assigned
    that label.

    Returns
    -------
    The result of calling ``smooth()`` on the resulting annotation.
    """
    graph = self._threads_graph()

    # one sorted list of shots per connected component
    threads = [
        sorted(component) for component in nx.connected_components(graph)
    ]
    # order threads by their first shot
    threads.sort(key=lambda shots: shots[0])

    annotation = Annotation()
    labels = getLabelGenerator()

    for shots in threads:
        thread_label = next(labels)
        for shot in shots:
            annotation[shot] = thread_label

    return annotation.smooth()
def __call__(self, current_file, annotated=False):
    """Run the diarization pipeline on one file.

    Steps: speech activity detection, speaker change detection, speech
    turn extraction, per-turn embedding, clustering.

    Parameters
    ----------
    current_file : dict
        File description. Passed to ``self.sad_``, ``self.scd_``,
        ``self.emb_`` and ``self.weight_``; must provide ``'uri'``.
    annotated : bool, optional
        When true, speech turns are additionally cropped to
        ``get_annotated(current_file)``. Defaults to False.

    Returns
    -------
    hypothesis : Annotation or None
        One label per kept speech turn, or ``None`` when no speech turn
        has any embedding.
    """
    # speech activity detection: raw scores, then binarized regions
    sad_scores = self.sad_(current_file)
    speech = self.sad_binarize_.apply(
        sad_scores, dimension=self.sad__dimension)

    # speaker change detection: raw scores, then peak picking
    scd_scores = self.scd_(current_file)
    changes = self.scd_peak_.apply(
        scd_scores, dimension=self.scd__dimension)

    # speech turns
    turns = changes.crop(speech)
    if annotated:
        turns = turns.crop(get_annotated(current_file))

    # drop speech turns for which no embedding is available
    emb = self.emb_(current_file)
    kept_turns = []
    for turn in turns:
        if len(emb.crop(turn, mode='loose')) > 0:
            kept_turns.append(turn)

    weight = self.weight_(current_file)

    # one vector per turn: mean of embeddings scaled by (1 - weight)
    vectors = []
    for turn in kept_turns:
        turn_emb = emb.crop(turn, mode='loose')
        turn_scale = 1 - weight.crop(turn, mode='loose')
        vectors.append(np.mean(turn_emb * turn_scale, axis=0))

    if not vectors:
        return None

    fX = l2_normalize(np.vstack(vectors))

    # speech turn clustering
    cluster_labels = self.cls_.apply(fX)

    # build hypothesis from clustering results
    hypothesis = Annotation(uri=current_file['uri'])
    for turn, cluster in zip(kept_turns, cluster_labels):
        hypothesis[turn] = cluster

    return hypothesis
def __call__(self, current_file: ProtocolFile) -> Annotation:
    """Build the annotation of a file from the grouped RTTM table.

    Parameters
    ----------
    current_file : ProtocolFile
        File description; only ``"uri"`` is read.

    Returns
    -------
    Annotation
        One track per row of ``self.data_.get_group(uri)``, using the
        row index as track name and ``speaker`` as label. Empty when
        the group does not exist.

    Raises
    ------
    ValueError
        When a row describes an empty segment.
    """
    uri = current_file["uri"]
    annotation = Annotation(uri=uri)

    try:
        rows = self.data_.get_group(uri).iterrows()
    except KeyError:
        # no speech turn for this file: nothing to add
        return annotation

    for track, turn in rows:
        segment = Segment(turn.start, turn.start + turn.duration)
        if segment:
            annotation[segment, track] = turn.speaker
            continue
        raise ValueError(
            f"Found empty segment in {self.rttm} for file {uri} around t={turn.start:.3f}s"
        )

    return annotation
def test_labels(annotation):
    """Check label listing, per-segment labels and label renaming."""
    assert annotation.labels() == ['Leonard', 'Penny', 'Sheldon']
    assert annotation.get_labels(Segment(8, 10)) == {'Penny', 'Sheldon'}

    # character name -> actor name
    actor_of = {
        'Penny': 'Kaley Cuoco',
        'Sheldon': 'Jim Parsons',
        'Leonard': 'Johnny Galecki',
    }

    renamed = Annotation(
        uri='TheBigBangTheory.Season01.Episode01',
        modality='speaker')
    expected_tracks = [
        (Segment(3, 5), '_', 'Kaley Cuoco'),
        (Segment(5.5, 7), '_', 'Johnny Galecki'),
        (Segment(8, 10), '_', 'Kaley Cuoco'),
        (Segment(8, 10), 'anything', 'Jim Parsons'),
    ]
    for segment, track, actor in expected_tracks:
        renamed[segment, track] = actor

    assert annotation.rename_labels(actor_of) == renamed
def _xxx_iter(self, subset):
    """Yield one file dictionary per trial of `subset`.

    For each trial returned by ``self._xxx_try_iter(subset)``, every
    segment of its ``'try_with'`` entry is labelled with the trial's
    ``'reference'``, using the segment position as track name.

    Yields
    ------
    dict
        With keys ``'database'`` (always ``'VoxCeleb'``), ``'uri'``,
        ``'annotated'`` and ``'annotation'``.
    """
    for trial in self._xxx_try_iter(subset):
        regions = trial['try_with']
        uri = trial['uri']

        labelled = Annotation(uri=uri)
        speaker = trial['reference']
        for track, region in enumerate(regions):
            labelled[region, track] = speaker

        current_file = {
            'database': 'VoxCeleb',
            'uri': uri,
            'annotated': regions,
            'annotation': labelled,
        }
        yield current_file
def read_annotation(filename, annotation_type=None, skip_tokens=()):
    """Read an HTK label file into a pyannote ``Annotation``.

    Every non-blank line must contain exactly three whitespace-separated
    fields, ``start end label``. ``start`` and ``end`` are integers that
    are divided by 10,000,000 to obtain seconds; ``label`` is
    upper-cased.

    Parameters
    ----------
    filename : str
        Path to the label file. A path that is not an existing file
        yields an empty annotation.
    annotation_type : optional
        Used as the ``uri`` of the returned annotation.
    skip_tokens : container of str, optional
        Upper-case labels to leave out. Defaults to an empty tuple
        (an immutable default, so it cannot be shared-and-mutated
        across calls).

    Returns
    -------
    annotation : Annotation

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three fields, or its
        times are not integers.
    """
    annotation = Annotation(uri=annotation_type)

    if not os.path.isfile(filename):
        return annotation

    with open(filename) as fid:
        for line in fid:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline)
                continue

            start, end, label = fields

            # convert to seconds
            start = int(start) / 10000000.
            end = int(end) / 10000000.

            label = label.upper()
            if label not in skip_tokens:
                annotation[Segment(start, end)] = label

    return annotation
def run(self):
    """Collect talking-face segments from per-sequence XML results.

    For every line of ``self.in_sequences()``, fields 4, 5 and 6
    (0-based, whitespace-separated) give the face track identifier, a
    sequence index and the sequence start time. The matching XML file
    is scanned for lines containing ``SpeechSegment``; each one adds a
    ``'talking'`` segment on track ``identifier``, with times shifted by
    the sequence start time. The result is dumped to ``self.out_put()``.

    XML files that are missing or cannot be parsed are skipped on
    purpose (best effort); segments read before the failure are kept.
    """
    # XML files generated by Gregory
    TEMPLATE = '{workdir}/external/gregory/{exp}/{episode}.{identifier:05d}.{i:02d}.{modality}.xml'

    episode = pyannote_workflows.tasks.tvd_dataset.get_episode(
        self.in_sequences().task)

    talkingFace = Annotation()

    with self.in_sequences().open('r') as g:
        for seq_line in g:
            # for each test sequence, load Gregory's results
            # and keep only talking faces (with original face track ID)
            seq_tokens = seq_line.strip().split()
            identifier = int(seq_tokens[4])
            i = int(seq_tokens[5])
            start_time = float(seq_tokens[6])

            path = TEMPLATE.format(
                workdir=self.workdir, exp=self.exp,
                modality=self.modality, episode=episode,
                identifier=identifier, i=i)

            try:
                with open(path, 'r') as h:
                    for xml_line in h:
                        if 'SpeechSegment' not in xml_line:
                            continue
                        xml_tokens = xml_line.strip().split()
                        # tokens 3 and 4 look like key="value"
                        stime = start_time + float(
                            xml_tokens[3].split('"')[1])
                        etime = start_time + float(
                            xml_tokens[4].split('"')[1])
                        segment = Segment(stime, etime)
                        talkingFace[segment, identifier] = 'talking'
            except Exception:
                # Deliberately best effort: a missing or malformed XML
                # file must not abort the whole task. Unlike a bare
                # `except:`, this lets KeyboardInterrupt and SystemExit
                # propagate.
                continue

    with self.out_put().open('w') as f:
        pyannote.core.json.dump(talkingFace, f)
def get_annotations(rec_name, pred_labels, label_dir, pred_dir):
    """Load the reference and build the predicted annotation of a recording.

    Parameters
    ----------
    rec_name : str
        Recording name; ``<label_dir>/<rec_name>.json`` and
        ``<pred_dir>/<rec_name>.csv`` are read.
    pred_labels : sequence
        Predicted label of each row of the CSV file, in row order.
    label_dir : str
        Directory holding the reference JSON files.
    pred_dir : str
        Directory holding the prediction CSV files. Columns 1 and 2
        (0-based) are read as integer start and end.

    Returns
    -------
    true_annotation, pred_annotation
        Both with segment boundaries divided by ``hp.callhome_rate``
        (sample index to time).
    """
    true_annotation = json.load_from(path.join(label_dir, rec_name + '.json'))

    # Modify region extent from sample index to time.
    # Iterate over a snapshot: entries are deleted and re-inserted below,
    # which must not happen while iterating the live annotation.
    for seg in list(true_annotation.itersegments()):
        new_seg = Segment(seg.start / hp.callhome_rate,
                          seg.end / hp.callhome_rate)
        seg_label = true_annotation[seg]
        del true_annotation[seg]
        true_annotation[new_seg] = seg_label

    # Create prediction annotation.
    # ndmin=2 keeps a (rows, 2) shape even when the file has one row;
    # otherwise loadtxt returns a flat array and row indexing fails.
    starts_ends = np.loadtxt(path.join(pred_dir, rec_name + '.csv'),
                             dtype=int, delimiter=',', usecols=[1, 2],
                             ndmin=2)

    pred_annotation = Annotation()
    for i, (start, end) in enumerate(starts_ends):
        cur_seg = Segment(start / hp.callhome_rate,
                          end / hp.callhome_rate)
        pred_annotation[cur_seg] = pred_labels[i]

    return true_annotation, pred_annotation
def load_rttm(file_rttm):
    """Load RTTM file

    Parameter
    ---------
    file_rttm : `str`
        Path to RTTM file. Fields are whitespace-separated; only the
        2nd (uri), 4th (start), 5th (duration) and 8th (speaker) of the
        ten columns are used.

    Returns
    -------
    annotations : `dict`
        Speaker diarization as a {uri: pyannote.core.Annotation}
        dictionary. Each turn is stored on a track named after its row
        index in the file.
    """
    names = [
        "NA1",
        "uri",
        "NA2",
        "start",
        "duration",
        "NA3",
        "NA4",
        "speaker",
        "NA5",
        "NA6",
    ]
    dtype = {"uri": str, "start": float, "duration": float, "speaker": str}

    data = pd.read_csv(
        file_rttm,
        names=names,
        dtype=dtype,
        # equivalent to the deprecated `delim_whitespace=True`
        sep=r"\s+",
        keep_default_na=False,
    )

    annotations = {}
    for uri, turns in data.groupby("uri"):
        annotation = Annotation(uri=uri)
        for i, turn in turns.iterrows():
            segment = Segment(turn.start, turn.start + turn.duration)
            annotation[segment, i] = turn.speaker
        annotations[uri] = annotation

    return annotations
def merge_frames(df_outputs, frame_list, filename):
    """Merge per-frame binary speaker activity into labelled segments.

    Parameters
    ----------
    df_outputs : pandas.DataFrame
        One column per speaker, one row per frame. A value of 1 opens
        (or continues) a segment for that speaker, a value of 0 closes
        it.
    frame_list : sequence
        ``frame_list[i][0]`` and ``frame_list[i][1]`` are read as the
        start and end of frame ``i`` (converted with ``float``).
    filename : str
        Unused; kept for backward compatibility with callers.

    Returns
    -------
    annotation : Annotation
        One segment per run of active frames, labelled with the column
        name. A closed segment runs from the start of its first active
        frame to the end of the frame that closed it; a segment still
        open after the last frame ends with the last frame seen.
    """
    annotation = Annotation()

    for speaker in df_outputs.columns.tolist():
        # None means "no segment currently open". A numeric sentinel
        # cannot be used because 0.0 is a legitimate start time.
        seg_start = None
        seg_end = None

        for i, label in enumerate(df_outputs[speaker]):
            if label == 1 and seg_start is None:
                seg_start = float(frame_list[i][0])
                seg_end = float(frame_list[i][1])
            elif label == 0 and seg_start is not None:
                seg_end = float(frame_list[i][1])
                annotation[Segment(start=seg_start, end=seg_end)] = speaker
                seg_start = None
            elif seg_start is not None:
                # segment continues: keep track of where it currently ends
                seg_end = float(frame_list[i][1])

        # flush a segment that is still open when the frames run out
        if seg_start is not None:
            annotation[Segment(start=seg_start, end=seg_end)] = speaker

    return annotation
def _subset(self, subset):
    """Yield the files of `subset` for the Albayzin2016 database.

    Reads, from the ``data`` directory next to this module:

    * ``<subset>.lst``: one uri per line;
    * ``rttm/<subset>/*``: one RTTM file per recording, parsed with
      ``read_rttm_file_trn`` when `subset` is ``'trn'`` and with
      ``read_rttm_file_other`` otherwise;
    * ``<subset>.time``: space-separated ``uri duration`` lines.

    Yields
    ------
    dict
        With keys ``'database'``, ``'uri'``, ``'annotated'`` (a segment
        from 0 to the file duration) and ``'annotation'``.
    """
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

    # file identifiers of this subset
    with open(op.join(data_dir, '{subset}.lst'.format(subset=subset))) as f:
        uris = [line.strip() for line in f]

    # reference annotation of every RTTM file found for this subset
    rttms = {}
    rttm_dir = op.join(data_dir, 'rttm', subset)
    for rttm_name in listdir(rttm_dir):
        rttm_path = op.join(data_dir, 'rttm', subset, rttm_name)
        if subset == 'trn':
            rttm = read_rttm_file_trn(rttm_path)
        else:
            rttm = read_rttm_file_other(rttm_path)

        file_uri = rttm['uri'].iloc[0]
        reference = Annotation()
        for _, turn in rttm.iterrows():
            begin = float(turn['start'])
            reference[Segment(begin, begin + float(turn['duration']))] = turn['label']
        rttms[file_uri] = reference

    # by default the whole file duration is considered annotated
    times = {}
    with open(op.join(data_dir, '{subset}.time'.format(subset=subset))) as f:
        for line in f:
            fields = line.split(' ')
            times[fields[0]] = Segment(0, float(fields[1]))

    for uri in uris:
        yield {
            'database': 'Albayzin2016',
            'uri': uri,
            'annotated': times[uri],
            'annotation': rttms[uri],
        }
def run(self):
    """Propagate face cluster labels to talking-face tracks.

    Face clusters are read from ``self.in_clusters()`` and talking
    faces from ``self.in_talking()``. For every pair yielded by
    ``talking.co_iter(clusters)`` whose two track names are equal, the
    talking-face track receives the label of the cluster track. The
    result is written to ``self.out_put()``.
    """
    with self.in_clusters().open('r') as fp:
        clusters = pyannote.core.json.load(fp)

    with self.in_talking().open('r') as fp:
        talking = pyannote.core.json.load(fp)

    talkingClusters = Annotation()
    for talking_key, cluster_key in talking.co_iter(clusters):
        segment, track = talking_key
        other_segment, other_track = cluster_key
        # only propagate between tracks carrying the same name
        if track == other_track:
            talkingClusters[segment, track] = clusters[other_segment, other_track]

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(talkingClusters, fp)
def fun(threshold):
    """Objective value of a clustering threshold (lower is better).

    For every file of the enclosing scope's ``subset`` of ``_protocol``,
    the linkage ``Z[uri]`` is cut at `threshold` (``criterion="distance"``)
    and the resulting clusters, paired with the time ranges in
    ``t[uri]``, are scored against the reference with an unweighted
    ``DiarizationPurityCoverageFMeasure``.

    Returns
    -------
    float
        ``1.0 - abs(metric)`` once all files have been accumulated.
    """
    _metric = DiarizationPurityCoverageFMeasure(weighted=False)

    files = getattr(_protocol, subset)()
    for current_file in files:
        uri = get_unique_identifier(current_file)
        uem = get_annotated(current_file)
        reference = current_file["annotation"]

        assignment = fcluster(Z[uri], threshold, criterion="distance")

        hypothesis = Annotation(uri=uri)
        for (begin, end), cluster_id in zip(t[uri], assignment):
            hypothesis[Segment(begin, end)] = cluster_id

        # called for its side effect: the metric accumulates internally
        _metric(reference, hypothesis, uem=uem)

    return 1.0 - abs(_metric)
def _subset(self, protocol, corpus, subset):
    """Yield the files of one protocol/corpus/subset of RTVE2018.

    Reads, from the ``data`` directory next to this module:

    * ``<protocol>.<corpus>.<subset>.lst``: one uri per line;
    * ``rttm/*``: one RTTM file per recording (entries whose name
      contains ``'FACEREF'`` are ignored), parsed with
      ``read_rttm_file``.

    Yields
    ------
    dict
        With keys ``'database'``, ``'uri'`` and ``'annotation'``. No
        ``'annotated'`` key is provided: the code that used to read
        per-file durations from a ``.time`` file is disabled.
    """
    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

    # file identifiers of this subset
    lst_path = op.join(
        data_dir,
        '{protocol}.{corpus}.{subset}.lst'.format(
            subset=subset, corpus=corpus, protocol=protocol))
    with open(lst_path) as f:
        uris = [line.strip() for line in f]

    # reference annotation of every RTTM file
    rttms = {}
    for rttm_name in listdir(op.join(data_dir, 'rttm')):
        if 'FACEREF' in rttm_name:
            continue

        rttm = read_rttm_file(op.join(data_dir, 'rttm', rttm_name))
        file_uri = rttm['uri'].iloc[0]

        reference = Annotation()
        for _, turn in rttm.iterrows():
            begin = float(turn['start'])
            reference[Segment(begin, begin + float(turn['duration']))] = turn['label']
        rttms[file_uri] = reference

    for uri in uris:
        yield {
            'database': 'RTVE2018',
            'uri': uri,
            'annotation': rttms[uri],
        }
def _parse_groundtruth(self):
    """Attach the ground-truth annotation to every element.

    For each element of ``self.elements``, the file
    ``<data_in_base_dir>/<id>/<id>.txt`` is read. Every non-blank line
    holds ``start end speaker``, separated either by single spaces or
    by tabs. Times go through ``self._adjust_time`` and ``':'`` is
    removed from the speaker name.

    Side effects
    ------------
    Sets ``element['reference']`` (an ``Annotation``) and
    ``element['speakers']`` (the set of speaker names) on each element.
    """
    for element in self.elements:
        groundtruthhandle = os.path.join(
            self.data_in_base_dir, element['id'], element['id'] + '.txt')

        # context manager so the file is closed even if parsing fails
        with open(groundtruthhandle, 'r') as handle:
            lines = [line.strip() for line in handle]

        reference = Annotation(uri=element['id'])
        speakers = set()

        for line in lines:
            if not line:
                # tolerate blank lines (e.g. a trailing newline)
                continue

            # check whether tab or space delimited
            entry = line.split(' ')
            if len(entry) < 3:
                entry = line.split('\t')

            ref_start = self._adjust_time(float(entry[0]))
            ref_end = self._adjust_time(float(entry[1]))
            ref_spkr = (entry[2]).replace(':', '')

            reference[Segment(ref_start, ref_end)] = ref_spkr
            speakers.add(ref_spkr)

        element['reference'] = reference
        element['speakers'] = speakers
def define_hypothesis(sequences, cluster_mapping):
    """Build a hypothesis annotation from clustered sequences.

    Consecutive sequences assigned to the same cluster are merged into
    a single segment running from the start of the first one to the end
    of the last one.

    Parameters
    ----------
    sequences : sequence
        Items exposing ``start`` and ``end``.
    cluster_mapping : sequence
        ``cluster_mapping[i]`` is the cluster of ``sequences[i]``.

    Returns
    -------
    hypothesis : Annotation
        Segment boundaries are divided by 1000 when the configuration
        entry ``['hypothesis']['scaledown']`` is truthy.
    """
    hypothesis = Annotation()
    scaledown = SPDR_Util.load_config()['hypothesis']['scaledown']

    total = len(sequences)
    first = 0
    while first < total:
        # extend the run while the next sequence shares the cluster
        last = first
        while last + 1 < total and cluster_mapping[last] == cluster_mapping[last + 1]:
            last += 1

        begin = sequences[first].start
        finish = sequences[last].end
        if scaledown:
            begin = begin / 1000
            finish = finish / 1000

        hypothesis[Segment(float(begin), float(finish))] = cluster_mapping[last]
        first = last + 1

    return hypothesis
def xxx_iter(self, voxceleb, subset):
    """Iterate on VoxCeleb files

    Each file is yielded as a dictionary with the following keys:

    ['uri'] (`str`)
        Unique file identifier.
    ['database'] (`str`)
        Always 'VoxCeleb'.
    ['annotation'] (`Annotation`)
        A single segment covering the whole file, labelled with the
        speaker (the part of the uri before the first '/').
    ['annotated'] (`Timeline`)
        A single segment covering the whole file.

    Parameters
    ----------
    voxceleb : {1, 2}
        VoxCeleb1 or VoxCeleb2
    subset : {'dev', 'tst'}
        Development or test subset.
    """
    # load durations
    path = Path(__file__).parent / 'data'
    path = path / f'vox{voxceleb:d}_{subset}_duration.txt.gz'
    content = pd.read_table(
        path,
        names=['uri', 'duration'],
        index_col='uri',
        # equivalent to the deprecated `delim_whitespace=True`
        sep=r'\s+',
    )

    for uri, duration in content.itertuples():
        speaker = uri.split('/')[0]
        segment = Segment(0, duration)

        annotation = Annotation(uri=uri)
        annotation[segment] = speaker

        annotated = Timeline(segments=[segment], uri=uri)

        current_file = {
            'uri': uri,
            'database': 'VoxCeleb',
            'annotation': annotation,
            'annotated': annotated,
        }
        yield current_file
def apply(self, current_file, model=None):
    """Label a file frame by frame and merge frames into segments.

    Parameters
    ----------
    current_file : dict
        File description, passed to ``self.train`` (when no model is
        given) and to ``SequenceLabeling.apply``.
    model : optional
        Model to use. When ``None``, one is obtained from
        ``self.train(current_file)``.

    Returns
    -------
    pred_annotation : Annotation
        One segment per run of identical consecutive predictions,
        labelled with the predicted value. A run covering frames
        ``[i, j)`` spans from the middle of window ``i`` to the middle
        of window ``j``. When ``self.source == 'annotated'``, runs
        whose predicted value is 0 are left out.
    """
    duration = self.config_['sequences']['duration']
    step = self.config_['sequences']['step']

    if model is None:
        model = self.train(current_file)

    sequence_labeling = SequenceLabeling(model, duration=duration, step=step)
    prediction = sequence_labeling.apply(current_file)

    data = prediction.data
    window = prediction.sliding_window

    # index of the first frame of every run of identical predictions
    boundaries = [0] + list(np.where(data[1:] - data[:-1] != 0)[0] + 1)

    # Close the last run: pairing change points alone would never emit
    # the run that follows the last change, and would emit nothing at
    # all for a constant prediction.
    # NOTE(review): assumes `sliding_window` can be indexed one past the
    # last frame, consistently with how inner boundaries are placed.
    n_frames = len(data)
    if n_frames > boundaries[-1]:
        boundaries.append(n_frames)

    pred_annotation = Annotation()
    for start_ind, end_ind in zip(boundaries[:-1], boundaries[1:]):
        if self.source == 'annotated' and data[start_ind] == 0:
            continue
        start = window[start_ind].middle
        end = window[end_ind].middle
        pred_annotation[Segment(start, end)] = data[start_ind]

    return pred_annotation
def merge_frames(outputs, frame_list):
    """Merge per-frame speaker labels into labelled segments.

    Parameters
    ----------
    outputs : iterable
        Per-frame labels. Only the values 1 and 2 are turned into
        segments; any other value just separates them.
    frame_list : sequence
        ``frame_list[i][0]`` is read as the start of frame ``i`` and
        ``frame_list[i][1]`` as its end.

    Returns
    -------
    annotation : Annotation
        One segment per run of consecutive frames carrying the same
        speaker number, labelled with that number.

    Notes
    -----
    NOTE(review): a run that is still open when `outputs` is exhausted
    is not written to the annotation (there is no flush after the inner
    loop) -- confirm whether that is intended.
    """
    annotation = Annotation()
    # one pass over all frames per speaker number (1, then 2)
    for speaker_num in range(1,3):
        # seg_start == seg_end == 0 doubles as "no segment is open"
        seg_start = 0
        seg_end = 0
        # position of the last frame that extended the open segment
        index = 0
        # last segment written for this speaker
        smooth_segment = Segment(start=0, end=0)
        # set to 1 when a segment is closed, then incremented on every
        # following frame that does not belong to this speaker
        skip = 0
        for i, label in enumerate(outputs):
            if (label == speaker_num) and (seg_start == 0) and (seg_end == 0):
                # A frame of this speaker while no segment is open.
                # NOTE(review): `skip` is only ever assigned 0, 1 or
                # skip + 1, so `(skip != 0) and (skip < 1)` looks like it
                # can never be true; the "re-open the previous segment"
                # branch below appears to be dead code.
                if (skip != 0) and (skip < 1):
                    try:
                        del annotation[smooth_segment]
                        seg_start = float(smooth_segment.start)
                        seg_end = float(smooth_segment.end)
                        skip = 0
                    # NOTE(review): bare except silently ignores any
                    # failure, including KeyboardInterrupt
                    except:
                        None
                else:
                    # open a new segment on this frame
                    seg_start = float(frame_list[i][0])
                    seg_end = float(frame_list[i][1])
                    index = i
            elif (label != speaker_num) and (seg_end > 0):
                # first frame of another label: close and store the segment
                annotation[Segment(start=seg_start, end = seg_end)] = speaker_num
                smooth_segment = Segment(start=seg_start, end=seg_end)
                skip = 1
                seg_start = 0
                seg_end = 0
            elif (seg_end > 0) and ((i - index) == 1):
                # next consecutive frame of this speaker: extend the segment
                index = i
                seg_end = frame_list[i][1]
                #print('step length away')
            elif (seg_end > 0) and ((i - index) > 1):
                # frame of this speaker after a gap: store the open segment
                # and start a new one.
                # NOTE(review): `index` is updated on every frame that
                # extends a segment, so this branch looks unreachable --
                # confirm.
                annotation[Segment(start=seg_start, end=seg_end)] = speaker_num
                seg_start = float(frame_list[i][0])
                seg_end = float(frame_list[i][1])
                index = i
            elif (label != speaker_num) and (seg_end == 0):
                # still outside any segment of this speaker
                skip = skip + 1
    return annotation
def get_hypothesis(hypotheses, current_file):
    """Get hypothesis for given file

    Parameters
    ----------
    hypotheses : `dict`
        Speaker diarization hypothesis provided by `load_rttm`.
    current_file : `dict`
        File description as given by pyannote.database protocols.
        Only 'uri' is read.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Hypothesis corresponding to `current_file`. When it is found
        through a substring match, the stored hypothesis has its `uri`
        attribute overwritten with the file uri (in place).

    Raises
    ------
    ValueError
        When more than one key of `hypotheses` is a substring of the
        file uri.
    """
    uri = current_file['uri']

    if uri in hypotheses:
        return hypotheses[uri]

    # if the exact 'uri' is not available in hypothesis,
    # look for matching substring
    tmp_uri = [u for u in hypotheses if u in uri]

    # no matching speech turns. return empty annotation
    if not tmp_uri:
        msg = f'Could not find hypothesis for file "{uri}"; assuming empty file.'
        warnings.warn(msg)
        return Annotation(uri=uri, modality='speaker')

    # exactly one matching file. return it
    if len(tmp_uri) == 1:
        hypothesis = hypotheses[tmp_uri[0]]
        hypothesis.uri = uri
        return hypothesis

    # more than one matching file. error.
    # The message is already fully interpolated: running `.format()` on
    # it again would fail whenever a uri contains curly braces.
    msg = f'Found too many hypotheses matching file "{uri}" ({tmp_uri}).'
    raise ValueError(msg)
def run(self):
    """Turn speaker tracks into a gap-filled timeline or annotation.

    The duration of the wav file at ``self.in_wav().path`` defines the
    file extent. Segments of the annotation read from
    ``self.in_speaker()`` are gathered into a timeline; gaps within the
    extent that are shorter than ``self.fill_gaps`` are added to it and
    the timeline is replaced by its coverage.

    The result written to ``self.out_put()`` is

    * when ``self.to_annotation`` is true: an annotation with one label
      per segment, passed through
      ``anonymize_labels(generator='string')``;
    * otherwise: the timeline itself.
    """
    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()

    # float() forces true division: on Python 2, int / int would
    # truncate the duration to whole seconds
    duration = frames / float(rate)
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    timeline = Timeline()
    for segment, _ in speaker.itertracks():
        timeline.add(segment)

    # fill gaps
    for gap in timeline.gaps(extent):
        if gap.duration < self.fill_gaps:
            timeline.add(gap)

    timeline = timeline.coverage()

    # dump as annotation...
    if self.to_annotation:
        annotation = Annotation()
        for s, segment in enumerate(timeline):
            annotation[segment] = s
        annotation = annotation.anonymize_labels(generator='string')

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)

    # ... or as timeline
    else:
        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(timeline, fp)
def trn_iter(self):
    """Yield the training files described in ``data/voxceleb1.csv``.

    Only rows whose ``identification`` column equals ``'trn'`` are
    used; they are grouped by ``uri``.

    Yields
    ------
    dict
        With keys ``'uri'``, ``'database'`` (always ``'VoxCeleb'``),
        ``'annotation'`` (one segment per row, labelled with the row's
        speaker) and ``'annotated'`` (the timeline of that annotation).
    """
    here = op.dirname(op.realpath(__file__))
    csv_path = op.join(op.join(here, 'data'), 'voxceleb1.csv')

    table = pd.read_csv(csv_path, index_col=['segment'])
    training = table.groupby('identification').get_group('trn')

    for uri, turns in training.groupby('uri'):
        reference = Annotation(uri=uri)
        for turn in turns.itertuples():
            reference[Segment(turn.start, turn.end)] = turn.speaker

        yield {
            'uri': uri,
            'database': 'VoxCeleb',
            'annotation': reference,
            'annotated': reference.get_timeline(),
        }
def run(self):
    """Run BIC segmentation and store the result as an annotation.

    Features are unpickled from ``self.in_features()`` and the initial
    segmentation is read from ``self.in_segmentation()``. Every segment
    of the resulting timeline gets its position as label, and the
    annotation is written to ``self.out_put()``.
    """
    bic = pyannote.algorithms.segmentation.bic.BICSegmentation(
        penalty_coef=self.penalty_coef,
        covariance_type=self.covariance_type,
        min_duration=self.min_duration,
        precision=self.precision)

    # NOTE(review): pickle must only be used on trusted files
    with self.in_features().open('r') as fp:
        features = pickle.load(fp)

    with self.in_segmentation().open('r') as fp:
        initial = pyannote.core.json.load(fp)

    refined = bic.apply(features, segmentation=initial)

    result = Annotation()
    for position, segment in enumerate(refined):
        result[segment] = position

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(result, fp)
def read_annotaitons(data_dir):
    """Read every ``.txt`` annotation file of a directory.

    Each file holds whitespace-separated rows with four columns:
    ``start end speaker speakerID``.

    Parameters
    ----------
    data_dir : str
        Directory to scan (not recursive). Files are visited in the
        order returned by ``os.listdir``.

    Returns
    -------
    annotations : list of Annotation
        One per file, with ``uri`` set to the file name without its
        extension; each row is stored on a track named after its
        position in the file and labelled with ``speakerID``.
    max_length
        Largest ``end`` value seen over all files (0 when there is
        none).
    speakers : dict
        Maps every ``speakerID`` seen to itself.
    """
    names = ['start', 'end', 'speaker', 'speakerID']

    annotations = []
    speakers = {}
    max_length = 0

    # iterate through the text annotation files
    for filename in os.listdir(data_dir):
        if not filename.endswith(".txt"):
            continue

        uri, _ = os.path.splitext(os.path.basename(filename))
        annotation = Annotation(uri=uri)

        parsed_file = read_table(
            os.path.join(data_dir, filename),
            # equivalent to the deprecated `delim_whitespace=True`
            sep=r'\s+',
            names=names)

        for t, turn in enumerate(parsed_file.itertuples()):
            segment = Segment(start=turn.start, end=turn.end)
            annotation[segment, t] = turn.speakerID
            if max_length < turn.end:
                max_length = turn.end
            speakers[turn.speakerID] = turn.speakerID

        annotations.append(annotation)

    return annotations, max_length, speakers
def define_hypothesis_for_embeddings(cluster_mapping, start_time=0):
    """Build a hypothesis annotation from per-segment cluster labels.

    Item ``i`` of `cluster_mapping` describes a fixed-size slice that
    starts at ``start_time + i * segment_size``, where ``segment_size``
    is the configuration entry ``['segment']['size']`` divided by 1000.
    Consecutive slices with the same non-negative label are merged into
    one segment; slices with a negative label are left out.

    Parameters
    ----------
    cluster_mapping : sequence
        Cluster label of every slice.
    start_time : number, optional
        Time of the first slice. Defaults to 0.

    Returns
    -------
    hypothesis : Annotation
    """
    hypothesis = Annotation()
    segment_size = SPDR_Util.load_config()['segment']['size'] / 1000

    count = len(cluster_mapping)
    position = 0
    while position < count:
        if cluster_mapping[position] >= 0:
            begin = start_time + (position * segment_size)
            finish = begin + segment_size
            # absorb the following slices that share the same label
            while (position + 1 < count
                   and cluster_mapping[position] == cluster_mapping[position + 1]):
                finish += segment_size
                position += 1
            hypothesis[Segment(begin, finish)] = cluster_mapping[position]
        position += 1

    return hypothesis
def iter_triplets(self, from_annotation):
    """Yield (anchor, positive, negative) segment triplets

    Tracks shorter than ``self.duration`` are ignored. Nothing is
    yielded when fewer than two labels remain.

    Parameters
    ----------
    from_annotation : Annotation
        Annotation from which triplets are obtained.

    Yields
    ------
    tuple
        ``(anchor, positive, negative)`` segments, or
        ``((anchor, label), (positive, label), (negative, label))``
        when ``self.yield_label`` is set. When ``self.duration`` is
        truthy, each segment first goes through ``self.pick``.
    """
    generator = RandomTrackTriplets(per_label=self.per_label,
                                    yield_label=self.yield_label)

    # copy of the input restricted to long enough tracks
    long_enough = Annotation(uri=from_annotation.uri,
                             modality=from_annotation.modality)
    for segment, track, label in from_annotation.itertracks(label=True):
        if not segment.duration < self.duration:
            long_enough[segment, track] = label

    if len(long_enough.labels()) < 2:
        return

    for anchor, positive, negative in generator.iter_triplets(long_enough):
        members = (anchor, positive, negative)

        segments = [member[0] for member in members]
        if self.duration:
            segments = [self.pick(segment) for segment in segments]
        a, p, n = segments

        if not self.yield_label:
            yield a, p, n
            continue

        a_, p_, n_ = [member[2] for member in members]
        yield (a, a_), (p, p_), (n, n_)
def preprocess(self, embedding):
    """Load face embeddings and build the initial one-track-per-label annotation.

    Parameters
    ----------
    embedding : str
        Path to face embeddings: a whitespace-separated table without
        header whose columns are ``time``, ``track`` and 128 embedding
        dimensions ``d0`` ... ``d127``.

    Returns
    -------
    starting_point : Annotation
        One track per face track that ``self._to_segment`` maps to a
        non-empty segment, labelled with the track identifier.
    data : pandas.DataFrame
        The embedding table, sorted by track then time.
    """
    # TODO : option to only keep 'detections'
    # (make sure it does not alter 'starting_point' segments)

    names = ['time', 'track'] + ['d{0}'.format(i) for i in range(128)]

    # sep=r'\s+' is equivalent to the deprecated `delim_whitespace=True`
    data = read_table(embedding, sep=r'\s+', header=None, names=names)
    data.sort_values(by=['track', 'time'], inplace=True)

    starting_point = Annotation(modality='face')

    segments = data.groupby('track').apply(self._to_segment)
    # `.items()` replaces `.iteritems()`, which recent pandas removed
    for track, segment in segments.items():
        if not segment:
            continue
        starting_point[segment, track] = track

    return starting_point, data