def _add_metadata(self, corpus: Corpus) -> Corpus:
    """
    Attach the columns of ``self._meta_data`` to ``corpus`` as metas.

    The metadata table is indexed by its key column, aligned to the
    corpus rows, and every resulting column is appended as a meta
    attribute whose type is guessed from the stringified values.

    Parameters
    ----------
    corpus : Corpus
        Corpus to extend. Returned unchanged when it is ``None``, has no
        "path" attribute in its domain, when no metadata is loaded, or
        when the metadata contains neither ``META_DATA_FILE_KEY`` nor
        ``CONLLU_META_DATA`` as a column.

    Returns
    -------
    Corpus
        The corpus with one added meta column per metadata column.
    """
    if (corpus is None
            or "path" not in corpus.domain
            or self._meta_data is None
            or (self.META_DATA_FILE_KEY not in self._meta_data.columns
                and self.CONLLU_META_DATA not in self._meta_data.columns)):
        return corpus

    # NOTE(review): the guard above accepts either key column, while each
    # branch below needs one specific column - confirm that `is_conllu`
    # always agrees with the columns present in the metadata.
    if self.is_conllu:
        df = self._meta_data.set_index(self.CONLLU_META_DATA)
        path_column = corpus.get_column_view("utterance")[0]
    else:
        # Index by startdir-prefixed file key so it can be matched against
        # the values of the corpus "path" column.
        df = self._meta_data.set_index(
            self.startdir + self._meta_data[self.META_DATA_FILE_KEY])
        path_column = corpus.get_column_view("path")[0]

    # reindex() refuses a duplicated index; keep the first row per key.
    if df.index.has_duplicates:
        df = df[~df.index.duplicated(keep='first')]
    filtered = df.reindex(path_column)

    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that yields (column name, Series) pairs.
    for name, column in filtered.items():
        data = column.astype(str).values
        val_map, vals, var_type = guess_data_type(data)
        values, variable = sanitize_variable(
            val_map, vals, data, var_type, {},
            name=get_unique_names(corpus.domain, name))
        corpus = corpus.add_column(variable, values, to_metas=True)
    return corpus
def _add_metadata(self, corpus: Corpus) -> Corpus:
    """
    Append every column of ``self._meta_data`` to ``corpus`` as a string meta.

    The metadata rows are keyed by ``startdir`` + the ``META_DATA_FILE_KEY``
    column and aligned to the corpus "path" column. The corpus is returned
    untouched when it has no "path" attribute, when no metadata is loaded,
    or when the metadata lacks the ``META_DATA_FILE_KEY`` column.
    """
    if "path" not in corpus.domain:
        return corpus
    meta = self._meta_data
    if meta is None or self.META_DATA_FILE_KEY not in meta.columns:
        return corpus

    # Key the metadata by the full path so it lines up with corpus paths.
    full_paths = self.startdir + meta[self.META_DATA_FILE_KEY]
    indexed = meta.set_index(full_paths)
    corpus_paths = corpus.get_column_view("path")[0]

    # When a path occurs more than once, only its first row is kept.
    if len(indexed.index.drop_duplicates()) != len(indexed.index):
        first_occurrence = ~indexed.index.duplicated(keep='first')
        indexed = indexed[first_occurrence]

    aligned = indexed.reindex(corpus_paths)
    for col_name in aligned.columns:
        variable = StringVariable(get_unique_names(corpus.domain, col_name))
        col_values = aligned[col_name].to_numpy()
        corpus = corpus.add_column(variable, col_values, to_metas=True)
    return corpus