Exemple #1
0
    def set_document(self, file_path):
        """
        Show the contents of a file in this object's text edit.

        The file is read with file_ops.read_utf8 and the result is
        handed to self.textEdit, which is cleared first.

        :param file_path: location of the file to read
        :return: None
        """
        contents = file_ops.read_utf8(file_path)

        editor = self.textEdit
        # Empty the widget before handing it the freshly read text.
        editor.clear()
        editor.setText(contents)
Exemple #2
0
    def test_acs_large(self):
        """
        Test on actual documents

        Reads three files from test_data/full_test, runs app.acs_all on
        every pair of them, passes each result through strip_set, and
        asserts that the three stripped results share at least one
        element.
        """
        f_names = ('test_data/full_test/one.txt',
                   'test_data/full_test/two.txt',
                   'test_data/full_test/three.txt')
        first, second, third = [file_ops.read_utf8(f_name)
                                for f_name in f_names]

        # Same pair order as before: (one, two), (one, three), (two, three)
        pairings = ((first, second), (first, third), (second, third))
        results = [app.acs_all(left, right) for left, right in pairings]

        # TODO: replace with check for static string
        stripped = [strip_set(result) for result in results]

        common = stripped[0] & stripped[1] & stripped[2]
        self.assertGreaterEqual(len(common), 1)
Exemple #3
0
    def process(self):
        """
        Preprocess the input file unless its output already exists.

        Writes the raw text into the 'raw' subdirectory of the output
        directory, the standardized text into the 'pre' subdirectory,
        and a json file holding the dict form of the resulting Document
        directly into the output directory. When that json file is
        already present the method returns immediately, without writing
        or logging anything.
        """
        started = time.time()
        source = self.file_name
        target_dir = self.output_dir
        stem = path.get_name(source, extension=False)

        json_path = os.path.join(target_dir, stem + PREPROCESS_SUFFIX)
        if file_ops.exists(json_path):
            # Already preprocessed
            return

        # .tei/.xml inputs go through TEIReader, whose read() also yields
        # metadata; every other input is read with file_ops.read_utf8 and
        # gets empty metadata.
        if source.endswith(('.tei', '.xml')):
            raw_text, metadata = TEIReader(source).read()
        else:
            raw_text, metadata = file_ops.read_utf8(source), {}

        plain_name = stem + PLAIN_SUFFIX

        raw_path = os.path.join(target_dir, 'raw' + os.sep, plain_name)
        file_ops.write_utf8(raw_path, raw_text)

        pre_path = os.path.join(target_dir, 'pre' + os.sep, plain_name)
        file_ops.write_utf8(pre_path,
                            self.standardizer.standardize(raw_text))

        document = Document(file_name=source,
                            raw_file_name=raw_path,
                            pre_file_name=pre_path,
                            metadata=metadata)
        file_ops.write_json_utf8(json_path, document.to_dict())

        # Report elapsed wall-clock time together with the raw text length.
        elapsed = time.time() - started
        self._log_duration(elapsed, source, len(raw_text))
Exemple #4
0
    def process(self):
        """
        Produce the raw, standardized and json outputs for the input file.

        Nothing is done if the json output file already exists.
        Otherwise the input is read (through TEIReader for .tei/.xml
        files, through file_ops.read_utf8 for anything else), its raw
        and standardized text are written to the 'raw' and 'pre'
        subdirectories of the output directory, a Document describing
        those files is written as json, and the duration is logged.
        """
        t_begin = time.time()
        src_path = self.file_name
        base = path.get_name(src_path, extension=False)
        doc_path = os.path.join(self.output_dir, base + PREPROCESS_SUFFIX)

        if file_ops.exists(doc_path):
            # Already preprocessed
            return

        is_markup = src_path.endswith('.tei') or src_path.endswith('.xml')
        if not is_markup:
            # Plain input carries no metadata.
            metadata = {}
            raw_text = file_ops.read_utf8(src_path)
        else:
            tei_reader = TEIReader(src_path)
            raw_text, metadata = tei_reader.read()

        # The raw and preprocessed copies share one file name and differ
        # only in the subdirectory they are written to.
        text_name = base + PLAIN_SUFFIX

        raw_out = os.path.join(self.output_dir, 'raw' + os.sep, text_name)
        file_ops.write_utf8(raw_out, raw_text)

        standardized = self.standardizer.standardize(raw_text)
        pre_out = os.path.join(self.output_dir, 'pre' + os.sep, text_name)
        file_ops.write_utf8(pre_out, standardized)

        as_dict = Document(file_name=src_path,
                           raw_file_name=raw_out,
                           pre_file_name=pre_out,
                           metadata=metadata).to_dict()
        file_ops.write_json_utf8(doc_path, as_dict)

        self._log_duration(time.time() - t_begin, src_path, len(raw_text))
Exemple #5
0
 def pre_body(self):
     """
     Return the contents of the preprocessed file.

     The file named by self.pre_file_name is read with
     file_ops.read_utf8 on every call.
     """
     location = self.pre_file_name
     return file_ops.read_utf8(location)
Exemple #6
0
 def raw_body(self):
     """
     Return the contents of the raw file.

     The file named by self.raw_file_name is read with
     file_ops.read_utf8 on every call.
     """
     location = self.raw_file_name
     return file_ops.read_utf8(location)
Exemple #7
0
 def _read_file(self, file_name):
     """
     Return the contents of a test file, read via file_ops.read_utf8.

     :param file_name: name handed to self._get_test_file_name to
         obtain the path that is actually read
     """
     return file_ops.read_utf8(self._get_test_file_name(file_name))
Exemple #8
0
 def _read_file(self, file_name):
     """
     Resolve a test file name and return that file's contents.

     :param file_name: name passed through self._get_test_file_name
     :return: whatever file_ops.read_utf8 returns for the resolved path
     """
     resolved = self._get_test_file_name(file_name)
     contents = file_ops.read_utf8(resolved)
     return contents