Beispiel #1
0
    def test_filecount(self):
        """Check that the converted SectLabel data covers 40 distinct files."""
        parsed_lines = convert_sectlabel_to_json(SECTLABEL_FILENAME)["parse_sect"]

        # Each record carries the number of the file it came from; collapsing
        # them into a set leaves one entry per distinct file.
        distinct_file_numbers = {record["file_no"] for record in parsed_lines}

        assert len(distinct_file_numbers) == 40  # number of files expected
Beispiel #2
0
    def get_lines_labels(self, filename: str) -> (List[str], List[str]):
        """Load the lines and labels of the split selected by ``self.dataset_type``.

        The file is converted with ``convert_sectlabel_to_json``; every record
        under its ``"parse_sect"`` key contributes its ``"text"`` and
        ``"label"``. The data is then split with
        ``self.get_train_valid_test_stratified_split`` and the split matching
        ``self.dataset_type`` (``"train"``, ``"valid"`` or ``"test"``) is kept.
        For any other ``dataset_type`` the full, unsplit data is kept.

        When ``self.debug`` is set, a fixed-seed random sample (drawn with
        replacement) of ``self.debug_dataset_proportion`` of the selected
        split is returned instead.

        Args:
            filename: Path handed to ``convert_sectlabel_to_json``.

        Returns:
            A ``(texts, labels)`` pair of parallel sequences.
        """
        records = convert_sectlabel_to_json(filename)["parse_sect"]
        texts = [record["text"] for record in records]
        labels = [record["label"] for record in records]

        (train_lines, train_labels), (validation_lines, validation_labels), (
            test_lines,
            test_labels,
        ) = self.get_train_valid_test_stratified_split(texts, labels,
                                                       self.classname2idx)

        if self.dataset_type == "train":
            texts = train_lines
            labels = train_labels
        elif self.dataset_type == "valid":
            texts = validation_lines
            labels = validation_labels
        elif self.dataset_type == "test":
            texts = test_lines
            labels = test_labels

        if self.debug:
            np.random.seed(1729)  # so we can debug deterministically
            num_text = len(texts)
            # Nothing to sample from an empty split; return it unchanged
            # instead of asking numpy for indices from an empty range.
            if num_text > 0:
                sample_size = int(self.debug_dataset_proportion * num_text)
                # `high` is exclusive in np.random.randint, so the bound must
                # be `num_text` (not `num_text - 1`) for the last example to
                # be reachable and for a single-example split to be valid.
                sample_indices = np.random.randint(0, num_text,
                                                   size=sample_size)
                texts = [texts[index] for index in sample_indices]
                labels = [labels[index] for index in sample_indices]

        return texts, labels
def get_parsect_data():
    """Return the result of converting ``SECT_LABEL_FILE`` with ``convert_sectlabel_to_json``."""
    return convert_sectlabel_to_json(SECT_LABEL_FILE)
Beispiel #4
0
    def test_label_not_empty(self):
        """Check that every converted SectLabel record has a non-empty label."""
        parsed_lines = convert_sectlabel_to_json(SECTLABEL_FILENAME)["parse_sect"]

        # Gather every record whose label is falsy; the test passes only when
        # no such record exists.
        unlabeled = [record for record in parsed_lines if not record["label"]]

        assert not unlabeled