Exemple #1
0
 def from_dict(d):
     """
     Build a MatchSet from its dict representation
     :param d: dict holding 'matches', 'alpha_doc' and 'beta_doc' entries
     :return: MatchSet
     """
     # Each entry under 'matches' is itself a dict form of a Match
     match_list = []
     for match_dict in d['matches']:
         match_list.append(Match.from_dict(match_dict))
     # The two document entries are handed to Document.from_json as-is
     alpha_document = Document.from_json(d['alpha_doc'])
     beta_document = Document.from_json(d['beta_doc'])
     return MatchSet(matches=match_list,
                     alpha_doc=alpha_document,
                     beta_doc=beta_document)
Exemple #2
0
def process_parallel_worker(a, output_dir, gap_length, match_length, b,
                            comparator):
    """
    Compare one pair of documents; meant to be invoked by a parallel worker

    :param a: value handed to Document.from_json to load the alpha document
    :param output_dir: forwarded to the Processor as output_dir
    :param gap_length: forwarded to the Processor as gap_length
    :param match_length: forwarded to the Processor as match_length
    :param b: value handed to Document.from_json to load the beta document
    :param comparator: value substituted into COMPARATOR_PATH to obtain the
        import path of the comparator module
    """
    # Resolve the comparator to an imported module
    module_path = COMPARATOR_PATH.format(comparator)
    comparator_module = importlib.import_module(module_path)

    settings = {
        'output_dir': output_dir,
        'comparator': comparator_module,
        'gap_length': gap_length,
        'match_length': match_length,
        'percentage_match_length': None,
    }
    pair_processor = processor.Processor(**settings)

    alpha_document = Document.from_json(a)
    beta_document = Document.from_json(b)
    pair_processor.process(alpha_document=alpha_document,
                           beta_document=beta_document)
Exemple #3
0
def process_parallel_worker(a, output_dir, gap_length, match_length, b,
                            comparator):
    """
    Run the processor over a single alpha/beta pair (parallel worker entry)

    :param a: value handed to Document.from_json to load the alpha document
    :param output_dir: forwarded to the Processor as output_dir
    :param gap_length: forwarded to the Processor as gap_length
    :param match_length: forwarded to the Processor as match_length
    :param b: value handed to Document.from_json to load the beta document
    :param comparator: value substituted into COMPARATOR_PATH to obtain the
        import path of the comparator module
    """
    # Import the comparator module selected by the caller
    comparator_module = importlib.import_module(
        COMPARATOR_PATH.format(comparator))

    processor_kwargs = dict(output_dir=output_dir,
                            comparator=comparator_module,
                            gap_length=gap_length,
                            match_length=match_length,
                            percentage_match_length=None)
    worker = processor.Processor(**processor_kwargs)

    alpha_document = Document.from_json(a)
    beta_document = Document.from_json(b)
    worker.process(alpha_document=alpha_document,
                   beta_document=beta_document)
Exemple #4
0
def process_serial(args, alpha_files, beta_files):
    """
    Process every distinct alpha/beta file pair on a single thread

    A file is never compared with itself, and each unordered pair is
    processed at most once even if it appears in both orders.

    :param args: object exposing comparator, output_dir, gap_length and
        match_length attributes
    :param alpha_files: iterable of values accepted by Document.from_json
    :param beta_files: iterable of values accepted by Document.from_json
    :return: number of pairs that were processed
    """
    comparator_path = COMPARATOR_PATH.format(args.comparator)
    comparator = importlib.import_module(comparator_path)
    pro = processor.Processor(output_dir=args.output_dir,
                              comparator=comparator,
                              gap_length=args.gap_length,
                              match_length=args.match_length,
                              percentage_match_length=None)
    # Pairs are stored as sorted tuples in a set: the sort makes (a, b) and
    # (b, a) identical, and set membership avoids rescanning every earlier
    # pair on each iteration.
    # NOTE(review): assumes file entries are hashable and orderable
    # (e.g. path strings) - confirm against callers.
    compared = set()
    for a, b in itertools.product(alpha_files, beta_files):
        if a == b:
            continue
        pair = tuple(sorted((a, b)))
        if pair in compared:
            continue
        alpha = Document.from_json(a)
        beta = Document.from_json(b)
        pro.process(alpha_document=alpha, beta_document=beta)
        # Recorded only after processing succeeded
        compared.add(pair)
    return len(compared)
Exemple #5
0
def process_serial(args, alpha_files, beta_files):
    """
    Process every distinct alpha/beta file pair on a single thread

    A file is never compared with itself, and each unordered pair is
    processed at most once even if it appears in both orders.

    :param args: object exposing comparator, output_dir, gap_length and
        match_length attributes
    :param alpha_files: iterable of values accepted by Document.from_json
    :param beta_files: iterable of values accepted by Document.from_json
    :return: number of pairs that were processed
    """
    comparator_path = COMPARATOR_PATH.format(args.comparator)
    comparator = importlib.import_module(comparator_path)
    pro = processor.Processor(output_dir=args.output_dir,
                              comparator=comparator,
                              gap_length=args.gap_length,
                              match_length=args.match_length,
                              percentage_match_length=None)
    # Pairs are stored as sorted tuples in a set: the sort makes (a, b) and
    # (b, a) identical, and set membership avoids rescanning every earlier
    # pair on each iteration.
    # NOTE(review): assumes file entries are hashable and orderable
    # (e.g. path strings) - confirm against callers.
    compared = set()
    for a, b in itertools.product(alpha_files, beta_files):
        if a == b:
            continue
        pair = tuple(sorted((a, b)))
        if pair in compared:
            continue
        alpha = Document.from_json(a)
        beta = Document.from_json(b)
        pro.process(alpha_document=alpha, beta_document=beta)
        # Recorded only after processing succeeded
        compared.add(pair)
    return len(compared)
Exemple #6
0
    def setUp(self):
        """
        Load two documents and build ten single-character passage pairs,
        the matches derived from them, and the MatchSet holding those matches
        """
        # 'a'..'j' and 'A'..'J'; range (not xrange) runs on Python 2 and 3
        self.passages_a = [chr(ord('a') + i) for i in range(10)]
        self.passages_b = [chr(ord('A') + i) for i in range(10)]
        self.file_a = 'models/test_data/match_set_test.json'
        self.document_a = Document.from_json(self.file_a)
        self.file_b = 'models/test_data/match_set_test2.json'
        self.document_b = Document.from_json(self.file_b)

        # One (alpha half, beta half) tuple per position in the passage lists
        self.singlet_pairs = [
            (MatchHalf(passage=passage_a), MatchHalf(passage=passage_b))
            for passage_a, passage_b in zip(self.passages_a, self.passages_b)
        ]
        # Alpha/beta need to be actual documents, not names
        self.matches = Processor.singlet_pairs_to_matches(alpha=self.document_a,
                                                          beta=self.document_b,
                                                          singlet_pairs=self.singlet_pairs)
        self.match_set = MatchSet(alpha_doc=self.document_a,
                                  beta_doc=self.document_b,
                                  matches=self.matches)
Exemple #7
0
    def setUp(self):
        """
        Load two documents and build ten single-character passage pairs,
        the matches derived from them, and the MatchSet holding those matches
        """
        # 'a'..'j' and 'A'..'J'; range (not xrange) runs on Python 2 and 3
        self.passages_a = [chr(ord('a') + i) for i in range(10)]
        self.passages_b = [chr(ord('A') + i) for i in range(10)]
        self.file_a = 'models/test_data/match_set_test.json'
        self.document_a = Document.from_json(self.file_a)
        self.file_b = 'models/test_data/match_set_test2.json'
        self.document_b = Document.from_json(self.file_b)

        # One (alpha half, beta half) tuple per position in the passage lists
        self.singlet_pairs = [
            (MatchHalf(passage=passage_a), MatchHalf(passage=passage_b))
            for passage_a, passage_b in zip(self.passages_a, self.passages_b)
        ]
        # Alpha/beta need to be actual documents, not names
        self.matches = Processor.singlet_pairs_to_matches(
            alpha=self.document_a,
            beta=self.document_b,
            singlet_pairs=self.singlet_pairs)
        self.match_set = MatchSet(alpha_doc=self.document_a,
                                  beta_doc=self.document_b,
                                  matches=self.matches)
Exemple #8
0
 def test_read(self):
     """
     Check that TEIReader output agrees with TEIDocument and the stored JSON
     """
     tei_path = self._get_test_file_name(TEI_ZHI)

     # Body as reported by TEIDocument for the same file
     expected_body = tei_document.TEIDocument(tei_path).get_data()['body']

     tei_reader = reader.TEIReader(tei_path)
     body, metadata = tei_reader.read()
     self.assertEqual(expected_body, body)

     # The JSON fixture must carry the same body and metadata
     json_path = self._get_test_file_name(JSON_ZHI)
     stored_doc = Document.from_json(json_path)
     self.assertEqual(body, stored_doc.raw_body)
     self.assertEqual(metadata, stored_doc.metadata)
Exemple #9
0
 def test_read(self):
     """
     Check that TEIReader output agrees with TEIDocument and the stored JSON
     """
     tei_path = self._get_test_file_name(TEI_ZHI)

     # Body as reported by TEIDocument for the same file
     expected_body = tei_document.TEIDocument(tei_path).get_data()['body']

     tei_reader = reader.TEIReader(tei_path)
     body, metadata = tei_reader.read()
     self.assertEqual(expected_body, body)

     # The JSON fixture must carry the same body and metadata
     json_path = self._get_test_file_name(JSON_ZHI)
     stored_doc = Document.from_json(json_path)
     self.assertEqual(body, stored_doc.raw_body)
     self.assertEqual(metadata, stored_doc.metadata)
Exemple #10
0
 def test_smoke(self):
     """
     Smoke test - check that the preprocessor runs without exploding

     After processing, every file in the output directory whose name
     contains the input's base name is loaded and its file names checked.
     """
     pp = Preprocessor(file_name=self.file_name,
                       input_dir=self.input_dir,
                       output_dir=self.output_dir)
     pp.process()
     out_dir_files = os.listdir(self.output_dir)
     # The base name does not depend on the loop variable, so compute it once
     name = utilities.path.get_name(self.file_name,
                                    extension=False)
     # NOTE(review): if no output file name contains `name`, the loop body
     # never runs and the test passes without asserting anything.
     for file_name in out_dir_files:
         if name not in file_name:
             continue
         file_path = os.path.join(self.output_dir, file_name)
         doc = Document.from_json(file_path)
         self.assertNotEqual(doc.pre_file_name, self.file_name)
         self.assertEqual(doc.file_name, 'test_preprocessed/lorem.json')