def test_r_group_table(self):
    """Resolve a two-variable R-group grid and verify each variable/value/label triple."""
    # First sentence is the header row; the remaining sentences are the table rows.
    grid = [Sentence('R1 R2'), Sentence('1a CH3 C'), Sentence('1b Me Br')]
    resolved = r_group.resolve_r_group_grid(grid)
    r_groups_list = r_group.separate_duplicate_r_groups(resolved)

    # One entry per expected group: (var, value, first label) for R1, then for R2.
    expected_groups = [
        (('R1', 'CH3', '1a'), ('R2', 'C', '1a')),
        (('R1', 'Me', '1b'), ('R2', 'Br', '1b')),
    ]

    for index, expected_pair in enumerate(expected_groups):
        # Convert both members of the pair before asserting on either of them.
        first = r_groups_list[index][0].convert_to_tuple()
        second = r_groups_list[index][1].convert_to_tuple()
        for (var, value, labels), (exp_var, exp_value, exp_label) in zip((first, second), expected_pair):
            self.assertEqual(var.text, exp_var)
            self.assertEqual(value.text, exp_value)
            self.assertEqual(labels[0].text, exp_label)
def test_r_group_assignment(self):
    """Check R-group detection on chained '=' assignments, one sentence at a time."""

    def _resolve(sentence):
        # Detect -> standardize -> filter repeated labels -> split duplicates.
        found = r_group.detect_r_group_from_sentence(sentence, indicator='=')
        found = r_group.standardize_values(found)
        found = r_group.filter_repeated_labels(found)
        return r_group.separate_duplicate_r_groups(found)

    sentences = [Sentence('R1 = R2 = H'), Sentence('R1 = R2 = Ac')]
    out = [_resolve(sentence) for sentence in sentences]

    # Expected (var, value) text for the first two R-groups of the first
    # group list produced from each sentence.
    expected = [
        (('R1', 'R2'), ('R2', '[H]')),
        (('R1', 'R2'), ('R2', 'Ac')),
    ]
    for sentence_index, expected_pairs in enumerate(expected):
        for position, (var_text, value_text) in enumerate(expected_pairs):
            detected = out[sentence_index][0][position]
            self.assertEqual(detected.var.text, var_text)
            self.assertEqual(detected.value.text, value_text)
def test_r_group_simple_table(self):
    """Resolve a single-variable R-group grid and check both resolved entries."""
    # First sentence is the header row; the remaining sentences are the table rows.
    grid = [Sentence('R'), Sentence('1a CH3'), Sentence('1b Me')]
    resolved = r_group.resolve_r_group_grid(grid)

    # Convert both entries up front, then compare them against the expected text.
    first = resolved[0].convert_to_tuple()
    second = resolved[1].convert_to_tuple()
    expected = (('R', 'CH3', '1a'), ('R', 'Me', '1b'))

    for (var, value, labels), (exp_var, exp_value, exp_label) in zip((first, second), expected):
        self.assertEqual(var.text, exp_var)
        self.assertEqual(value.text, exp_value)
        self.assertEqual(labels[0].text, exp_label)
def do_parse(self, input, expected):
    """Scan ``input`` with ``mp_phrase`` and compare the first match's XML to ``expected``.

    :param input: Text used to build the ``Sentence``
    :param expected: Expected unicode XML serialization of the first match
    """
    sentence = Sentence(input)
    log.debug(sentence)
    log.debug(sentence.tagged_tokens)

    # Only the first scan result is checked; its first item is the element to serialize.
    first_match = next(mp_phrase.scan(sentence.tagged_tokens))
    element = first_match[0]
    log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))

    serialized = etree.tostring(element, encoding='unicode')
    self.assertEqual(expected, serialized)
def read_label(fig, label, whitelist=LABEL_WHITELIST):
    """Read the text inside a label's bounding box using OCR and store it on ``label``.

    :param fig: Object whose ``img`` attribute holds the unprocessed image
    :param Label label: Label object supplying the bounding box
        (``left``, ``right``, ``top``, ``bottom``); its ``text`` attribute is overwritten
    :param whitelist: Forwarded to ``get_text`` as its ``whitelist`` argument
    :return: ``(label, avg_conf)`` where ``label.text`` is a list of tagged ``Sentence``
        objects and ``avg_conf`` is the mean ``confidence`` of the OCR results;
        ``(label, 0)`` with ``label.text == []`` when OCR returns nothing
    :rtype: tuple
    """
    size = 5  # padding width handed to ``pad``

    img = convert_greyscale(fig.img)
    cropped_img = crop(img, label.left, label.right, label.top, label.bottom)
    # NOTE(review): constant_values=(1, 1) presumably pads with white in a 0-1 greyscale image - confirm.
    padded_img = pad(cropped_img, size, mode='constant', constant_values=(1, 1))
    text = get_text(padded_img, x_offset=label.left, y_offset=label.top,
                    psm=PSM.SINGLE_BLOCK, whitelist=whitelist)

    # Nothing recognised: leave the label empty and report zero confidence.
    if not text:
        label.text = []
        return label, 0

    raw_sentences = get_sentences(text)

    # Tag each sentence. An empty ``raw_sentences`` naturally yields an empty list,
    # so no separate length check (previously ``len(...) is not 0``) is needed.
    label.text = [
        Sentence(sentence, word_tokenizer=ChemSchematicResolverTokeniser(), parsers=[LabelParser()])
        for sentence in raw_sentences
    ]

    # Average confidence over the OCR results for the block.
    avg_conf = np.mean([t.confidence for t in text])
    log.info('Confidence in OCR: %s', avg_conf)

    return label, avg_conf
def do_parse(self, input, expected):
    """Scan ``input`` with ``mc_phrase`` and compare all serialized matches to ``expected``.

    :param input: Text used to build the ``Sentence``
    :param expected: Expected list of unicode XML strings, one per scan result
    """
    sentence = Sentence(input)
    serialized_matches = []
    # Each scan result is unpacked; only its first item (the element) is used.
    for element, *_unused in mc_phrase.scan(sentence.tagged_tokens):
        compact_xml = etree.tostring(element, encoding='unicode')
        serialized_matches.append(compact_xml)
        log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
    self.assertEqual(expected, serialized_matches)
def test_duplicate_r_group_vars_in_one_sentence(self):
    """Run the R-group pipeline on one sentence that repeats R1 and R2.

    NOTE(review): nothing is asserted on ``output``; as written this test only
    checks that the pipeline runs without raising.
    """
    sent = Sentence('A R1=H R2=NH B R1=H R2=C')

    # Detect -> attach label candidates -> standardize -> filter repeated labels.
    var_value_pairs = r_group.detect_r_group_from_sentence(sent)
    with_labels = r_group.get_label_candidates(sent, var_value_pairs)
    standardized = r_group.standardize_values(with_labels)
    filtered = r_group.filter_repeated_labels(standardized)

    # Split duplicate variables into separate lists, then convert each list to tuples.
    grouped = r_group.separate_duplicate_r_groups(filtered)
    output = [r_group.convert_r_groups_to_tuples(group) for group in grouped]
def do_parse(self, input, expected):
    """Parse ``input`` with ``TemParser`` and compare the first serialized record to ``expected``.

    When the parser produces no records, ``expected`` is compared against ``''``.

    :param input: Text used to build the ``Sentence``
    :param expected: Expected serialization of the first record
    """
    sentence = Sentence(input)
    log.debug(sentence)
    log.debug(sentence.tagged_tokens)

    serialized = [record.serialize() for record in TemParser().parse(sentence.tagged_tokens)]
    # Fall back to an empty string so "no records" can be expressed as expected == ''.
    first_record = serialized[0] if serialized else ''
    self.assertEqual(expected, first_record)
def do_parse(self, input, expected):
    """Scan ``input`` with ``ir``, compare the first match's XML to ``expected``, then print parsed records.

    :param input: Text used to build the ``Sentence``
    :param expected: Expected unicode XML serialization of the first match
    """
    sentence = Sentence(input)
    log.debug(sentence)
    log.debug(sentence.tagged_tokens)

    # Only the first scan result is checked; its first item is the element to serialize.
    first_match = next(ir.scan(sentence.tagged_tokens))
    element = first_match[0]
    log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
    self.assertEqual(expected, etree.tostring(element, encoding='unicode'))

    # NOTE(review): this prints to stdout rather than using ``log`` - looks like debug output.
    for record in IrParser().parse(sentence.tagged_tokens):
        print(record.serialize())
def do_parse(self, input, expected):
    """Scan ``input`` with ``chemical_label_phrase`` and compare all serialized matches to ``expected``.

    :param input: Text used to build the ``Sentence``
    :param expected: Expected list of unicode XML strings, one per scan result
    """
    sentence = Sentence(input)
    log.debug(sentence)
    log.debug(sentence.tagged_tokens)

    serialized_matches = []
    for match in chemical_label_phrase.scan(sentence.tagged_tokens):
        # The first item of each scan result is the element to serialize.
        element = match[0]
        log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
        serialized_matches.append(etree.tostring(element, encoding='unicode'))

    self.assertEqual(expected, serialized_matches)
def test_mp1(self):
    """Check that a melting-point range following 'mp' is extracted as an ``mp_phrase``."""
    # Input sentence and the XML expected from the first match.
    sentence = Sentence('Colorless solid (81% yield, 74.8 mg, 0.22 mmol); mp 77.2–77.5 °C.')
    expected = '<mp_phrase><mp><value>77.2–77.5</value><units>°C</units></mp></mp_phrase>'

    # Take the first scan result; its first item is the element to serialize.
    first_match = next(mp_phrase.scan(sentence.tagged_tokens))
    actual = etree.tostring(first_match[0], encoding='unicode')

    self.assertEqual(expected, actual)
def test_label_parsing(self):
    """Check that the sentence '3' parsed with ``LabelParser`` serializes to one ``labels`` record."""
    sentence = Sentence('3', parsers=[LabelParser()])
    expected_records = [{'labels': ['3']}]
    self.assertEqual(sentence.records.serialize(), expected_records)