Example #1
0
    def create_flair_corpus(self, train_samples_path, test_samples_path,
                            val_samples_path):
        """
        Create a flair Corpus object and save it to train, test, validation files.

        Each split is generated only when its output file is missing, so
        existing files are never overwritten.

        :param train_samples_path: Path to train samples
        :param test_samples_path: Path to test samples
        :param val_samples_path: Path to validation samples
        :return: None
        """
        if not path.exists("flair_train.txt"):
            train_samples = InputSample.read_dataset_json(train_samples_path)
            # Samples without entity spans carry no tagging signal, so they
            # are dropped from the train split (test/val keep all samples).
            train_tagged = [
                sample for sample in train_samples if len(sample.spans) > 0
            ]
            print(
                f"Kept {len(train_tagged)} train samples after removal of non-tagged samples"
            )
            train_data = InputSample.create_conll_dataset(train_tagged)
            self.to_flair(train_data, outfile="flair_train.txt")

        # Test and validation splits are identical in shape; delegate to a
        # shared helper instead of duplicating the read/convert/write steps.
        self._write_flair_split(test_samples_path, "flair_test.txt")
        self._write_flair_split(val_samples_path, "flair_val.txt")

    def _write_flair_split(self, samples_path, outfile):
        """
        Convert one dataset split to CoNLL and write it in flair format,
        unless ``outfile`` already exists.

        :param samples_path: Path to the split's JSON dataset
        :param outfile: Target flair-format file name
        :return: None
        """
        if not path.exists(outfile):
            samples = InputSample.read_dataset_json(samples_path)
            data = InputSample.create_conll_dataset(samples)
            self.to_flair(data, outfile=outfile)
Example #2
0
    def create_flair_corpus(self, train_samples_path, test_samples_path, val_samples_path):
        """Write flair-formatted train/test/val files; skip any split whose file already exists."""
        if not path.exists("flair_train.txt"):
            raw_train = read_synth_dataset(train_samples_path)
            # Only keep samples that actually carry entity spans.
            tagged_only = [s for s in raw_train if len(s.spans) > 0]
            print("Kept {} train samples after removal of non-tagged samples".format(len(tagged_only)))
            self.to_flair(InputSample.create_conll_dataset(tagged_only), outfile="flair_train.txt")

        if not path.exists("flair_test.txt"):
            conll_test = InputSample.create_conll_dataset(read_synth_dataset(test_samples_path))
            self.to_flair(conll_test, outfile="flair_test.txt")

        if not path.exists("flair_val.txt"):
            conll_val = InputSample.create_conll_dataset(read_synth_dataset(val_samples_path))
            self.to_flair(conll_val, outfile="flair_val.txt")
Example #3
0
def test_to_conll():
    """Converting samples to CoNLL should yield one distinct sentence per input sample."""
    import os

    # Locate the small generated fixture relative to this test file.
    here = os.path.dirname(os.path.realpath(__file__))
    samples = read_synth_dataset(os.path.join(here, "data/generated_small.txt"))

    conll = InputSample.create_conll_dataset(samples)

    distinct_sentences = conll['sentence'].unique()
    assert len(distinct_sentences) == len(samples)
    def _to_feature_set(self, dataset: List[InputSample]):
        """
        Turn a list of input samples into per-sentence CRF feature and label
        sequences.

        :param dataset: Samples to convert
        :return: Tuple of (feature sequences, label sequences), one entry
            per sentence.
        """
        conll_df = InputSample.create_conll_dataset(dataset)
        # Collect the (text, pos, label) rows of each sentence into a list
        # of triples, one list per sentence.
        grouped = conll_df.groupby("sentence")[["text", "pos", "label"]]
        sentences = grouped.apply(lambda rows: rows.values.tolist())

        features = [self.sent2features(sent) for sent in sentences]
        labels = [self.sent2labels(sent) for sent in sentences]
        return features, labels