def set_document(self, file_path):
    """
    Show the contents of a file in the text edit widget

    :param file_path: location handed to file_ops.read_utf8
    :return: None
    """
    contents = file_ops.read_utf8(file_path)
    editor = self.textEdit
    # Empty the widget first, then put the freshly read text in
    editor.clear()
    editor.setText(contents)
def test_acs_large(self):
    """
    Run acs_all over every pairing of three actual documents and
    require at least one stripped entry common to all three results
    """
    sample_files = ('test_data/full_test/one.txt',
                    'test_data/full_test/two.txt',
                    'test_data/full_test/three.txt')
    one, two, three = [file_ops.read_utf8(sample) for sample in sample_files]

    pairings = ((one, two), (one, three), (two, three))
    pair_results = [app.acs_all(left, right) for left, right in pairings]

    # TODO: replace with check for static string
    stripped = [strip_set(result) for result in pair_results]
    in_all = stripped[0] & stripped[1] & stripped[2]
    self.assertGreaterEqual(len(in_all), 1)
def process(self):
    """
    Preprocess the input file

    Writes the raw text and its standardized form under the output
    directory, followed by a json file representing the
    models.Document that points at both. Returns immediately when
    the json output file already exists.
    """
    started = time.time()
    source = self.file_name
    name = path.get_name(source, extension=False)
    out_file = os.path.join(self.output_dir, name + PREPROCESS_SUFFIX)
    if file_ops.exists(out_file):
        # Already preprocessed
        return

    # .tei/.xml inputs go through TEIReader, which also yields metadata;
    # anything else is read as-is with empty metadata
    if source.endswith(('.tei', '.xml')):
        raw_text, metadata = TEIReader(source).read()
    else:
        raw_text = file_ops.read_utf8(source)
        metadata = {}

    # The raw and preprocessed copies share one file name and differ
    # only in the sub-directory they are written to
    plain_name = name + PLAIN_SUFFIX

    raw_file = os.path.join(self.output_dir, 'raw' + os.sep, plain_name)
    file_ops.write_utf8(raw_file, raw_text)

    processed_text = self.standardizer.standardize(raw_text)
    pre_file = os.path.join(self.output_dir, 'pre' + os.sep, plain_name)
    file_ops.write_utf8(pre_file, processed_text)

    document = Document(file_name=source,
                        raw_file_name=raw_file,
                        pre_file_name=pre_file,
                        metadata=metadata)
    file_ops.write_json_utf8(out_file, document.to_dict())

    self._log_duration(time.time() - started, source, len(raw_text))
def pre_body(self):
    """
    Load the preprocessed body of the file from pre_file_name

    :return: the result of file_ops.read_utf8 for that file
    """
    location = self.pre_file_name
    return file_ops.read_utf8(location)
def raw_body(self):
    """
    Load the body of the file from raw_file_name

    :return: the result of file_ops.read_utf8 for that file
    """
    location = self.raw_file_name
    return file_ops.read_utf8(location)
def _read_file(self, file_name):
    """
    Read a test file given its name

    :param file_name: name resolved through self._get_test_file_name
    :return: the result of file_ops.read_utf8 for the resolved name
    """
    return file_ops.read_utf8(self._get_test_file_name(file_name))