コード例 #1
0
    def _add_metadata(self, corpus: Corpus) -> Corpus:
        """Attach the columns of ``self._meta_data`` to ``corpus`` as metas.

        Metadata rows are matched to corpus rows through a key column:

        * CoNLL-U mode (``self.is_conllu``): ``self._meta_data`` is indexed
          by its ``CONLLU_META_DATA`` column and matched against the corpus
          column ``"utterance"``.
        * otherwise: ``self._meta_data`` is indexed by ``self.startdir``
          prepended to its ``META_DATA_FILE_KEY`` column and matched against
          the corpus column ``"path"``.

        Each metadata column is cast to ``str``, passed through
        ``guess_data_type`` / ``sanitize_variable`` to build a variable, and
        appended to the corpus as a meta attribute under a name made unique
        with ``get_unique_names``.

        Parameters
        ----------
        corpus
            Corpus to extend; may be ``None``.

        Returns
        -------
        Corpus
            ``corpus`` unchanged when it is ``None``, has no ``"path"``
            attribute in its domain, no metadata is loaded, or the key column
            required by the current mode is missing; otherwise the corpus
            returned by the successive ``add_column`` calls.
        """
        if (corpus is None or "path" not in corpus.domain
                or self._meta_data is None):
            return corpus

        # Each mode needs its own key column; verifying only that *either*
        # key exists would let set_index / column lookup fail with KeyError
        # when just the other mode's key is present.
        if self.is_conllu:
            key, corpus_column = self.CONLLU_META_DATA, "utterance"
        else:
            key, corpus_column = self.META_DATA_FILE_KEY, "path"
        if key not in self._meta_data.columns:
            return corpus

        if self.is_conllu:
            df = self._meta_data.set_index(key)
        else:
            # Index by startdir + relative file key so that index values are
            # comparable with the values of the corpus "path" column.
            df = self._meta_data.set_index(
                self.startdir + self._meta_data[key])
        path_column = corpus.get_column_view(corpus_column)[0]

        # reindex needs unique labels: keep the first row of each duplicate.
        if not df.index.is_unique:
            df = df[~df.index.duplicated(keep='first')]
        filtered = df.reindex(path_column)

        # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
        # same (column name, Series) pairs on every pandas version.
        for name, column in filtered.items():
            data = column.astype(str).values
            val_map, vals, var_type = guess_data_type(data)
            values, variable = sanitize_variable(
                val_map, vals, data, var_type, {},
                name=get_unique_names(corpus.domain, name))
            corpus = corpus.add_column(variable, values, to_metas=True)
        return corpus
コード例 #2
0
    def _add_metadata(self, corpus: Corpus) -> Corpus:
        """Append every column of ``self._meta_data`` to ``corpus`` as a
        string meta attribute.

        Metadata rows are aligned with corpus rows by comparing
        ``self.startdir`` + the ``META_DATA_FILE_KEY`` column of the metadata
        with the corpus ``"path"`` column. When several metadata rows share
        the same key, only the first one is used.

        The corpus is returned untouched when it has no ``"path"`` attribute
        in its domain, when no metadata is loaded, or when the metadata has
        no ``META_DATA_FILE_KEY`` column.
        """
        if "path" not in corpus.domain:
            return corpus
        meta = self._meta_data
        if meta is None:
            return corpus
        if self.META_DATA_FILE_KEY not in meta.columns:
            return corpus

        # Key the metadata by startdir + file key so it can be looked up by
        # the values of the corpus "path" column.
        file_keys = self.startdir + meta[self.META_DATA_FILE_KEY]
        by_path = meta.set_index(file_keys)
        corpus_paths = corpus.get_column_view("path")[0]

        # reindex requires unique labels; retain the first of each duplicate.
        if not by_path.index.is_unique:
            first_occurrence = ~by_path.index.duplicated(keep='first')
            by_path = by_path[first_occurrence]

        aligned = by_path.reindex(corpus_paths)
        for label in aligned.columns:
            variable = StringVariable(get_unique_names(corpus.domain, label))
            column_values = aligned[label].to_numpy()
            corpus = corpus.add_column(variable, column_values, to_metas=True)

        return corpus