コード例 #1
0
    def test_r_group_table(self):
        """Resolve a two-variable R-group table and check each row's pairs.

        The first sentence names the variables (``R1``, ``R2``); every
        following sentence holds a label followed by one value per variable.
        """
        rows = [Sentence('R1 R2'), Sentence('1a CH3 C'), Sentence('1b Me Br')]

        grid = r_group.resolve_r_group_grid(rows)
        grouped = r_group.separate_duplicate_r_groups(grid)

        # One entry per table row: expected (variable, value, first label)
        # for each of the two columns.
        expected_rows = [
            [('R1', 'CH3', '1a'), ('R2', 'C', '1a')],
            [('R1', 'Me', '1b'), ('R2', 'Br', '1b')],
        ]

        for row_index, expected_pairs in enumerate(expected_rows):
            # Convert both pairs of the row before asserting on either one.
            observed_pairs = []
            for column in range(len(expected_pairs)):
                var, value, labels = grouped[row_index][column].convert_to_tuple()
                observed_pairs.append((var, value, labels))

            for observed, wanted in zip(observed_pairs, expected_pairs):
                var, value, labels = observed
                wanted_var, wanted_value, wanted_label = wanted
                self.assertEqual(var.text, wanted_var)
                self.assertEqual(value.text, wanted_value)
                self.assertEqual(labels[0].text, wanted_label)
コード例 #2
0
    def test_r_group_assignment(self):
        """
        Test chained '=' assignments detected separately in several sentences
        """

        def extract(sentence):
            found = r_group.detect_r_group_from_sentence(sentence, indicator='=')
            found = r_group.standardize_values(found)

            # Resolving positional labels where possible for 'or' cases
            found = r_group.filter_repeated_labels(found)

            # Separate duplicate variables into separate lists
            return r_group.separate_duplicate_r_groups(found)

        sentences = [Sentence('R1 = R2 = H'), Sentence('R1 = R2 = Ac')]
        out = [extract(sentence) for sentence in sentences]

        # (sentence index, pair index within the first list, variable, value)
        expectations = [
            (0, 0, 'R1', 'R2'),
            (0, 1, 'R2', '[H]'),
            (1, 0, 'R1', 'R2'),
            (1, 1, 'R2', 'Ac'),
        ]
        for sentence_index, pair_index, wanted_var, wanted_value in expectations:
            pair = out[sentence_index][0][pair_index]
            self.assertEqual(pair.var.text, wanted_var)
            self.assertEqual(pair.value.text, wanted_value)
コード例 #3
0
    def test_r_group_simple_table(self):
        """Resolve a single-variable table with two labelled rows."""
        rows = [Sentence('R'), Sentence('1a CH3'), Sentence('1b Me')]

        resolved = r_group.resolve_r_group_grid(rows)
        var_a, value_a, labels_a = resolved[0].convert_to_tuple()
        var_b, value_b, labels_b = resolved[1].convert_to_tuple()

        observed = [(var_a, value_a, labels_a), (var_b, value_b, labels_b)]
        # Expected (variable, value, first label) for each resolved entry.
        expected = [('R', 'CH3', '1a'), ('R', 'Me', '1b')]

        for (var, value, labels), (wanted_var, wanted_value, wanted_label) in zip(observed, expected):
            self.assertEqual(var.text, wanted_var)
            self.assertEqual(value.text, wanted_value)
            self.assertEqual(labels[0].text, wanted_label)
コード例 #4
0
 def do_parse(self, input, expected):
     """Assert that the first ``mp_phrase`` match in *input* serialises to *expected*.

     :param input: Text used to build the Sentence that is scanned
     :param expected: Expected ``etree.tostring`` output for the first match's result
     """
     sentence = Sentence(input)
     log.debug(sentence)
     log.debug(sentence.tagged_tokens)

     # Only the first match is examined; its first item is the result element.
     first_match = next(mp_phrase.scan(sentence.tagged_tokens))
     element = first_match[0]
     log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))

     actual = etree.tostring(element, encoding='unicode')
     self.assertEqual(expected, actual)
コード例 #5
0
def read_label(fig, label, whitelist=LABEL_WHITELIST):
    """Read the text inside a label's bounding box using OCR.

    :param fig: Figure object whose ``img`` attribute holds the unprocessed input image
    :param Label label: Label object containing the appropriate bounding box
        (``left``, ``right``, ``top``, ``bottom``); its ``text`` attribute is overwritten
    :param whitelist: Forwarded to ``get_text`` as its ``whitelist`` argument

    :returns: The updated label and the mean OCR confidence
        (``0`` when no text was recognised)
    :rtype: tuple
    """

    size = 5
    img = convert_greyscale(fig.img)
    cropped_img = crop(img, label.left, label.right, label.top, label.bottom)
    # Surround the crop with a constant border of 1s before running OCR.
    padded_img = pad(cropped_img, size, mode='constant', constant_values=(1, 1))
    text = get_text(padded_img, x_offset=label.left, y_offset=label.top, psm=PSM.SINGLE_BLOCK, whitelist=whitelist)
    if not text:
        label.text = []
        return label, 0
    raw_sentences = get_sentences(text)

    # Tag each sentence; an empty `raw_sentences` simply produces an empty list,
    # so no separate emptiness check (previously an `is not 0` identity test) is needed.
    tagged_sentences = [Sentence(sentence, word_tokenizer=ChemSchematicResolverTokeniser(),
                                 parsers=[LabelParser()]) for sentence in raw_sentences]
    label.text = tagged_sentences

    # Calculating average confidence for the block
    confidences = [t.confidence for t in text]
    avg_conf = np.mean(confidences)
    log.info('Confidence in OCR: %s', avg_conf)

    return label, avg_conf
コード例 #6
0
 def do_parse(self, input, expected):
     """Assert that every ``mc_phrase`` match in *input* serialises as listed in *expected*.

     :param input: Text used to build the Sentence that is scanned
     :param expected: List of expected ``etree.tostring`` outputs, one per match, in scan order
     """
     sentence = Sentence(input)
     serialized = []
     # Each match's first item is the result element; the rest is ignored.
     for element, *_rest in mc_phrase.scan(sentence.tagged_tokens):
         compact = etree.tostring(element, encoding='unicode')
         serialized.append(compact)
         log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
     self.assertEqual(expected, serialized)
コード例 #7
0
    def test_duplicate_r_group_vars_in_one_sentence(self):
        """Run the R-group pipeline on a sentence that repeats R1 and R2.

        NOTE(review): the result is collected in ``output`` but nothing is
        asserted on it here.
        """

        # The sentence is built from plain text; an explicit token-by-token
        # construction with custom taggers was previously sketched here.
        sentence = Sentence('A R1=H R2=NH B R1=H R2=C')

        pairs = r_group.detect_r_group_from_sentence(sentence)
        candidates = r_group.get_label_candidates(sentence, pairs)
        candidates = r_group.standardize_values(candidates)

        # Resolving positional labels where possible for 'or' cases
        candidates = r_group.filter_repeated_labels(candidates)

        # Separate duplicate variables into separate lists
        grouped = r_group.separate_duplicate_r_groups(candidates)

        output = [r_group.convert_r_groups_to_tuples(group) for group in grouped]
コード例 #8
0
 def do_parse(self, input, expected):
     """Assert that the first serialised ``TemParser`` record for *input* equals *expected*.

     :param input: Text used to build the Sentence that is parsed
     :param expected: Expected serialised first record, or ``''`` when nothing is parsed
     """
     sentence = Sentence(input)
     log.debug(sentence)
     log.debug(sentence.tagged_tokens)

     records = [record.serialize() for record in TemParser().parse(sentence.tagged_tokens)]
     # With no records at all, compare against an empty string instead.
     first_record = records[0] if records else ''
     self.assertEqual(expected, first_record)
コード例 #9
0
 def do_parse(self, input, expected):
     """Assert that the first ``ir`` match in *input* serialises to *expected*.

     After the assertion, every record produced by ``IrParser`` for the
     same tokens is serialised and printed.

     :param input: Text used to build the Sentence that is scanned
     :param expected: Expected ``etree.tostring`` output for the first match's result
     """
     sentence = Sentence(input)
     log.debug(sentence)
     log.debug(sentence.tagged_tokens)

     # Only the first match is examined; its first item is the result element.
     first_match = next(ir.scan(sentence.tagged_tokens))
     element = first_match[0]
     log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))

     actual = etree.tostring(element, encoding='unicode')
     self.assertEqual(expected, actual)

     for record in IrParser().parse(sentence.tagged_tokens):
         print(record.serialize())
コード例 #10
0
 def do_parse(self, input, expected):
     """Assert that every ``chemical_label_phrase`` match in *input* serialises as listed in *expected*.

     :param input: Text used to build the Sentence that is scanned
     :param expected: List of expected ``etree.tostring`` outputs, one per match, in scan order
     """
     s = Sentence(input)
     log.debug(s)
     log.debug(s.tagged_tokens)
     results = []
     # Iterate the matches directly: the position counter was never used.
     for match in chemical_label_phrase.scan(s.tagged_tokens):
         element = match[0]
         log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
         results.append(etree.tostring(element, encoding='unicode'))
     self.assertEqual(expected, results)
コード例 #11
0
    def test_mp1(self):
        """Check the first ``mp_phrase`` match for a sentence containing a melting-point range."""

        # Declaration
        sentence = Sentence(
            'Colorless solid (81% yield, 74.8 mg, 0.22 mmol); mp 77.2–77.5 °C.'
        )
        expected = '<mp_phrase><mp><value>77.2–77.5</value><units>°C</units></mp></mp_phrase>'

        # Testing: take the result element of the first match only
        first_match = next(mp_phrase.scan(sentence.tagged_tokens))
        element = first_match[0]

        # Assertion
        actual = etree.tostring(element, encoding='unicode')
        self.assertEqual(expected, actual)
コード例 #12
0
    def test_label_parsing(self):
        """Check the serialised records of the sentence '3' parsed with ``LabelParser``."""

        sentence = Sentence('3', parsers=[LabelParser()])
        serialized = sentence.records.serialize()

        expected = [{'labels': ['3']}]
        self.assertEqual(serialized, expected)