def from_dict(d):
    """
    Convert dict representation to MatchSet
    :param d: dict representation of a MatchSet
    :return: MatchSet
    """
    matches = [Match.from_dict(m) for m in d['matches']]
    alpha = Document.from_json(d['alpha_doc'])
    beta = Document.from_json(d['beta_doc'])
    return MatchSet(alpha_doc=alpha, beta_doc=beta, matches=matches)

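# A minimal usage sketch for from_dict(): the dict layout mirrors the keys read
# above ('matches', 'alpha_doc', 'beta_doc'), where the *_doc values are whatever
# Document.from_json() accepts (file paths in the tests below). The helper name,
# the paths, and the Match.to_dict() counterpart are assumptions for
# illustration, not part of this module.
def _example_from_dict(matches):
    d = {
        'matches': [m.to_dict() for m in matches],  # assumes Match.to_dict() exists
        'alpha_doc': 'corpus/alpha.json',           # hypothetical path
        'beta_doc': 'corpus/beta.json',             # hypothetical path
    }
    return from_dict(d)
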
def process_parallel_worker(a, output_dir, gap_length, match_length, b, comparator):
    """
    Worker for processing two files at a time in parallel
    """
    comparator_path = COMPARATOR_PATH.format(comparator)
    comparator = importlib.import_module(comparator_path)
    pro = processor.Processor(output_dir=output_dir,
                              comparator=comparator,
                              gap_length=gap_length,
                              match_length=match_length,
                              percentage_match_length=None)
    alpha = Document.from_json(a)
    beta = Document.from_json(b)
    pro.process(alpha_document=alpha, beta_document=beta)

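# A hedged dispatch sketch, not shown in this module: fan process_parallel_worker
# out over unique alpha/beta file pairs with multiprocessing. The args attributes
# mirror those used by process_serial below; the helper name and default pool
# size are assumptions for illustration only.
def _example_parallel_dispatch(args, alpha_files, beta_files):
    import itertools
    import multiprocessing

    pairs = set()
    for a, b in itertools.product(alpha_files, beta_files):
        if a != b:
            pairs.add(tuple(sorted([a, b])))
    pool = multiprocessing.Pool()
    for a, b in pairs:
        # Positional arguments follow process_parallel_worker's signature:
        # (a, output_dir, gap_length, match_length, b, comparator)
        pool.apply_async(process_parallel_worker,
                         (a, args.output_dir, args.gap_length,
                          args.match_length, b, args.comparator))
    pool.close()
    pool.join()
    return len(pairs)
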
def process_serial(args, alpha_files, beta_files):
    """
    Process on a single thread
    """
    comparator_path = COMPARATOR_PATH.format(args.comparator)
    comparator = importlib.import_module(comparator_path)
    pro = processor.Processor(output_dir=args.output_dir,
                              comparator=comparator,
                              gap_length=args.gap_length,
                              match_length=args.match_length,
                              percentage_match_length=None)
    compared = []
    for a, b in itertools.product(alpha_files, beta_files):
        this_set = sorted([a, b])
        if a != b and this_set not in compared:
            alpha = Document.from_json(a)
            beta = Document.from_json(b)
            pro.process(alpha_document=alpha, beta_document=beta)
            compared.append(this_set)
    return len(compared)

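# A hedged usage sketch for process_serial(): args is an argparse-style namespace
# exposing the attributes read above. The comparator name, length values, and
# file lists are hypothetical.
def _example_process_serial():
    import argparse

    args = argparse.Namespace(comparator='simple',
                              output_dir='output',
                              gap_length=3,
                              match_length=10)
    alpha_files = ['corpus/a.json', 'corpus/b.json']
    beta_files = ['corpus/b.json', 'corpus/c.json']
    return process_serial(args, alpha_files, beta_files)
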
def setUp(self):
    self.passages_a = [chr(i + ord('a')) for i in xrange(10)]
    self.passages_b = [chr(i + ord('A')) for i in xrange(10)]
    self.file_a = 'models/test_data/match_set_test.json'
    self.document_a = Document.from_json(self.file_a)
    self.file_b = 'models/test_data/match_set_test2.json'
    self.document_b = Document.from_json(self.file_b)
    self.matches = []
    self.singlet_pairs = []
    for i in xrange(len(self.passages_a)):
        a = MatchHalf(passage=self.passages_a[i])
        b = MatchHalf(passage=self.passages_b[i])
        s_pair = (a, b)
        self.singlet_pairs.append(s_pair)
    # Alpha/beta need to be actual documents, not names
    self.matches = Processor.singlet_pairs_to_matches(alpha=self.document_a,
                                                      beta=self.document_b,
                                                      singlet_pairs=self.singlet_pairs)
    self.match_set = MatchSet(alpha_doc=self.document_a,
                              beta_doc=self.document_b,
                              matches=self.matches)

def test_read(self):
    """
    Test reading of TEI XML file
    """
    real_data_file = self._get_test_file_name(TEI_ZHI)
    tei_doc = tei_document.TEIDocument(real_data_file)
    tei_data = tei_doc.get_data()
    tei_body = tei_data['body']
    r = reader.TEIReader(real_data_file)
    read_body, read_metadata = r.read()
    self.assertEqual(tei_body, read_body)
    json_name = self._get_test_file_name(JSON_ZHI)
    global_doc = Document.from_json(json_name)
    self.assertEqual(read_body, global_doc.raw_body)
    self.assertEqual(read_metadata, global_doc.metadata)

def test_smoke(self):
    """
    Smoke test - check that the preprocessor runs without exploding
    """
    pp = Preprocessor(file_name=self.file_name,
                      input_dir=self.input_dir,
                      output_dir=self.output_dir)
    pp.process()
    out_dir_files = os.listdir(self.output_dir)
    for file_name in out_dir_files:
        name = utilities.path.get_name(self.file_name, extension=False)
        if name in file_name:
            file_path = os.path.join(self.output_dir, file_name)
            doc = Document.from_json(file_path)
            self.assertNotEqual(doc.pre_file_name, self.file_name)
            self.assertEqual(doc.file_name, 'test_preprocessed/lorem.json')